def _train_model_fn(self, diter, x_placeholder, num_workers, num_features,
                    global_num_samples, num_iterations, schema_params: SchemaParams):
    """ The training objective function and the gradients. """
    value = tf1.constant(0.0, tf1.float64)
    # Add bias
    gradients = tf1.constant(np.zeros(num_features + 1))
    feature_bag_name = self.feature_bag_name
    label_column_name = schema_params.label_column_name
    sample_weight_column_name = schema_params.weight_column_name
    offset_column_name = self.offset_column_name
    is_regularize_bias = self.is_regularize_bias
    has_weight = self._has_feature(sample_weight_column_name)
    has_offset = self._has_feature(offset_column_name)
    i = 0

    def cond(i, value, gradients):
        return i < num_iterations

    def body(i, value, gradients):
        i += 1
        all_features, all_labels = diter.get_next()
        labels = all_labels[label_column_name]
        current_batch_size = tf1.shape(labels)[0]
        features = self._get_feature_bag_tensor(all_features, feature_bag_name, current_batch_size)
        weights = all_features[sample_weight_column_name] if has_weight \
            else tf1.ones(current_batch_size, tf1.float64)
        offsets = all_features[offset_column_name] if has_offset \
            else tf1.zeros(current_batch_size, tf1.float64)
        w = x_placeholder[:-1]
        b = x_placeholder[-1]
        logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                tf1.cast(tf1.expand_dims(w, 1), tf1.float64)) \
            + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1) \
            + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)
        loss = tf1.nn.sigmoid_cross_entropy_with_logits(
            labels=tf1.cast(labels, tf1.float64),
            logits=tf1.reshape(tf1.cast(logits, tf1.float64), [-1]))
        weighted_loss = tf1.cast(weights, tf1.float64) * loss
        # The regularizer has the option to include or exclude the bias.
        regularizer = tf1.nn.l2_loss(x_placeholder) if is_regularize_bias else tf1.nn.l2_loss(w)
        batch_value = tf1.reduce_sum(weighted_loss) + regularizer * self.l2_reg_weight \
            * tf1.cast(current_batch_size, tf1.float64) / global_num_samples
        batch_gradients = tf1.gradients(batch_value, x_placeholder)[0]
        value += batch_value
        gradients += batch_gradients
        return i, value, gradients

    _, value, gradients = tf1.while_loop(cond, body, [i, value, gradients])
    if num_workers > 1:
        # Sum the loss value and the gradients across all workers.
        reduced_value = collective_ops.all_reduce(
            value, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 0,
            merge_op='Add', final_op='Id')
        reduced_gradients = collective_ops.all_reduce(
            gradients, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 1,
            merge_op='Add', final_op='Id')
        return reduced_value, reduced_gradients
    else:
        return value, gradients
def worker_fn():
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops(cluster_resolver)
  collective_ops.all_reduce(
      constant_op.constant(1.),
      group_size=2,
      group_key=100,
      instance_key=100,
      merge_op="Add",
      final_op="Id",
      communication_hint="ring")
  if cluster_resolver.task_type == "worker":
    # MultiProcessRunner will auto restart worker-0.
    os._exit(1)  # pylint: disable=protected-access
  else:
    # The chief should eventually get a FailedPreconditionError after worker-0
    # has restarted.
    while True:
      time.sleep(1)
      try:
        context.context().check_collective_ops_peer_health(
            "/job:worker/replica:0/task:0",)
      except errors.UnavailableError:
        pass
      except errors.FailedPreconditionError:
        break
def testConstantWithScopedAllocator(self):
  group_size = 2
  group_key = 1
  instance_key1 = 1
  instance_key2 = 2
  graph_options = config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(do_constant_folding=True))
  cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                               graph_options=graph_options)
  rewrite_options = cfg.graph_options.rewrite_options
  rewrite_options.scoped_allocator_optimization = (
      rewriter_config_pb2.RewriterConfig.ON)
  del rewrite_options.scoped_allocator_opts.enable_op[:]
  rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')

  with self.session(config=cfg) as sess:
    run_ops = []
    for i in range(group_size):
      with ops.device('CPU:%d' % i):
        constant = constant_op.constant(i + 1.)
        input_tensor1 = array_ops.identity(constant)
        input_tensor2 = array_ops.identity(constant)
        reduced_tensor1 = collective_ops.all_reduce(
            input_tensor1, group_size, group_key, instance_key1, 'Add', 'Id')
        reduced_tensor2 = collective_ops.all_reduce(
            input_tensor2, group_size, group_key, instance_key2, 'Add', 'Id')
        run_ops.append(array_ops.identity(reduced_tensor1))
        run_ops.append(array_ops.identity(reduced_tensor2))
    results = sess.run(run_ops)
    self.assertEqual(results, [3., 3., 3., 3.])
def testCollectiveDeviceMismatch(self):
  group_key = 10
  instance_key = 20
  t0 = [1, 2, 3, 4]
  t1 = [5, 6, 7, 8]
  with ops.Graph().as_default(), self.session(
      config=self._configure(set_config_proto_nccl=False)) as sess:
    if not test_util.is_gpu_available(cuda_only=True):
      self.skipTest('No GPU available')
    with ops.device('/CPU:0'):
      in0 = constant_op.constant(t0)
      c0 = collective_ops.all_reduce(in0, self._group_size, group_key,
                                     instance_key, 'Add', 'Id')
    with ops.device('/GPU:0'):
      in1 = constant_op.constant(t1)
      c1 = collective_ops.all_reduce(in1, self._group_size, group_key,
                                     instance_key, 'Add', 'Id')
    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 100
    with self.assertRaisesRegex(errors.InternalError,
                                'but that group has type'):
      sess.run([c0, c1], options=run_options)
def testWhileWithScopedAllocator(self):
  group_size = 2
  group_key = 1
  instance_key0 = 1
  instance_key1 = 2
  config = config_pb2.ConfigProto(device_count={'CPU': group_size})
  rewrite_options = config.graph_options.rewrite_options
  rewrite_options.scoped_allocator_optimization = (
      rewriter_config_pb2.RewriterConfig.ON)
  del rewrite_options.scoped_allocator_opts.enable_op[:]
  rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')

  # Tests that execute collectives need to be enclosed in a graph or
  # tf.function.
  with ops.Graph().as_default():
    with self.session(config=config) as sess:
      run_ops = []
      for i in range(group_size):
        with ops.device('CPU:%d' % i):
          constant = constant_op.constant(0.)
          cond = lambda i: math_ops.less(i, 10.)
          body = lambda i: math_ops.add(i, 1.)
          input0 = control_flow_ops.while_loop(cond, body, [constant])
          input1 = math_ops.add(constant, 5)
          colred0 = collective_ops.all_reduce(input0, group_size, group_key,
                                              instance_key0, 'Add', 'Id')
          colred1 = collective_ops.all_reduce(input1, group_size, group_key,
                                              instance_key1, 'Add', 'Id')
          run_ops.append(math_ops.add_n([colred0, colred1]))
      results = sess.run(run_ops)
      self.assertEqual(results, [30., 30.])
def _check_health(self, device, group_key, instance_key):
  first = True
  # We need to use a large enough value so that the all-reduce forms a
  # complete RING. In the RING implementation, when the value is too small,
  # the all-reduce may degrade into broadcasts. This means that some worker
  # failures may not be detected.
  value = array_ops.ones((32, 32), dtype=dtypes.float32)
  while True:
    if self._check_health_thread_should_stop.is_set():
      return
    timeout = None
    if first:
      # For the first health check we set a timeout, since it may need to do
      # group resolution, which may hang if the cluster is never healthy.
      timeout = self._check_health_initial_timeout
      first = False
    try:
      # We use a dummy all-reduce as a way to check the health of a cluster.
      # For RING it should be able to detect failed workers in the cluster if
      # the values are large enough.
      #
      # We're not using CrossDeviceOps because we need to run it with
      # pre-allocated group and instance keys.
      #
      # TODO(b/151232436): Replace the reduce with a check health op once we
      # add that.
      with ops.device(device):
        collective_ops.all_reduce(
            value,
            group_size=self._num_workers,
            group_key=group_key,
            instance_key=instance_key,
            merge_op="Add",
            final_op="Id",
            subdiv_offsets=[0],
            communication_hint="ring",
            timeout=timeout)
      if context.is_async():
        context.async_wait()
    except (errors.UnavailableError, errors.DeadlineExceededError,
            errors.FailedPreconditionError, errors.CancelledError) as e:
      # TODO(b/151232436): Always raise UnavailableError when a peer fails.
      # For now there could be many kinds of errors:
      # - Unavailable: when the peer is not reachable, e.g. it's down.
      # - FailedPrecondition: when the peer has restarted.
      # - DeadlineExceeded: when the first check health exceeds the deadline,
      #   e.g. the peers take too long to be ready.
      # - Cancelled: when failures in organic collectives abort first,
      #   outgoing RPCs may be aborted with Cancelled.
      logging.error("Cluster check alive failed, aborting collectives")
      context.context().abort_collective_ops(
          errors.UNAVAILABLE, "cluster check alive failed: %s" % e)
    except Exception as e:  # pylint: disable=broad-except
      logging.exception("Unexpected exception in check alive.")
      context.context().abort_collective_ops(
          errors.INTERNAL, "unexpected exception in check alive: %s" % e)
      return
    time.sleep(self._check_health_interval)
def build_collective_reduce(input_tensors,
                            num_workers,
                            collective_keys,
                            reduction_op='Add',
                            unary_op='Id',
                            communication_hint='auto',
                            control_inputs=None):
  """Build a subgraph that does one full all-reduce, using the collective Op.

  Args:
    input_tensors: tensors within a single worker graph that are to be reduced
      together; must be one per device.
    num_workers: total number of workers with identical independent graphs that
      will be doing this same reduction. The reduction will actually include
      the corresponding tensors at all these workers.
    collective_keys: a CollectiveKeys object.
    reduction_op: string naming the reduction op.
    unary_op: string naming the unary final op.
    communication_hint: string providing hint to runtime for choosing
      collective implementation.
    control_inputs: if not None, add control edges between control_inputs and
      (index-wise) corresponding collective_reduce tensors.

  Returns:
    An array of final tensors, one per device, computed by the full reduction.
    If the group size (len(input_tensors) * num_workers) is less than two, the
    input tensors are returned unchanged.
  """
  group_size = len(input_tensors) * num_workers
  if group_size < 2:
    return input_tensors
  devices = [t.device for t in input_tensors]
  num_devices = len(devices)
  group_key = collective_keys.get_group_key(devices)
  instance_key = collective_keys.get_op_instance_key()
  subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
  if control_inputs:
    assert len(control_inputs) == len(input_tensors)
  out_tensors = []
  for dev_idx in range(num_devices):
    with ops.device(devices[dev_idx]):
      if control_inputs:
        assert control_inputs[dev_idx].device == input_tensors[dev_idx].device
        with ops.control_dependencies([control_inputs[dev_idx]]):
          reduce_op = collective_ops.all_reduce(input_tensors[dev_idx],
                                                group_size, group_key,
                                                instance_key, reduction_op,
                                                unary_op, subdiv_offsets,
                                                communication_hint)
      else:
        reduce_op = collective_ops.all_reduce(input_tensors[dev_idx],
                                              group_size, group_key,
                                              instance_key, reduction_op,
                                              unary_op, subdiv_offsets,
                                              communication_hint)
      out_tensors.append(reduce_op)
  return out_tensors
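A minimal usage sketch for build_collective_reduce above, assuming the CollectiveKeys helper from tensorflow.python.distribute.cross_device_utils is importable and a session is configured with two CPU devices; the tensor values and device strings are illustrative only, not from the original snippet.

# Hedged sketch: driving build_collective_reduce with two per-device tensors on
# a single worker. Assumes cross_device_utils.CollectiveKeys is available.
from tensorflow.python.distribute import cross_device_utils
from tensorflow.python.framework import constant_op, ops

per_device_tensors = []
for i in range(2):
  with ops.device('/CPU:%d' % i):
    per_device_tensors.append(constant_op.constant([1., 2.]))

keys = cross_device_utils.CollectiveKeys()
reduced = build_collective_reduce(per_device_tensors, num_workers=1,
                                  collective_keys=keys,
                                  reduction_op='Add', unary_op='Id')
# `reduced` holds one all-reduced tensor per input device; evaluated in a
# session configured with device_count={'CPU': 2}, each should be the
# elementwise sum [2., 4.].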
def testAbortInstanceParamsResolution(self, device, communication):
  if communication == "NCCL":
    self.skipTest("b/171358086: cannot test multi worker NCCL")
  dev0 = "/device:%s:0" % device
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops_with_barrier(cluster_resolver)
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant([1.])

  # First perform a normal all-reduce to complete the group resolution.
  with ops.device(dev0):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)

  # We use broadcast to test aborting instance resolution since only broadcast
  # waits for the group.

  if cluster_resolver.task_id == 1:

    def abort_fn():
      time.sleep(2)
      context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

    t = threading.Thread(target=abort_fn)
    t.start()

    # Use a different instance key to trigger another instance resolution.
    instance_key = 101
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      # This hangs on params resolution since we're only launching one
      # collective for a group size of 2.
      with ops.device(dev0):
        collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                      group_size, group_key, instance_key)

    # After abortion, subsequent collectives should fail immediately.
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                      group_size, group_key, instance_key)

    t.join()

  # Enable collective ops again in order to reset the collective executor.
  enable_collective_ops_with_barrier(cluster_resolver)
  # Reassign instance_key so that it's the same on each worker.
  instance_key = 100
  with ops.device(dev0):
    if cluster_resolver.task_id == 0:
      collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                    group_size, group_key, instance_key)
    else:
      collective_ops.broadcast_recv((1,), dtypes.float32, group_size,
                                    group_key, instance_key)
def run_all_reduce(group_key, instance_key, merge_op):
  group_size = 2
  t0 = [1., 20., 3., 40., 5.]
  t1 = [10., 2., 30., 4., 50.]
  os.environ['NCCL_DEBUG'] = 'INFO'
  os.environ['NCCL_LAUNCH_MODE'] = 'PARALLEL'
  with ops.device('/GPU:0'):
    in0 = constant_op.constant(t0)
    c0 = collective_ops.all_reduce(in0, group_size, group_key, instance_key,
                                   merge_op, final_op='Id',
                                   communication_hint='nccl')
  with ops.device('/GPU:1'):
    in1 = constant_op.constant(t1)
    c1 = collective_ops.all_reduce(in1, group_size, group_key, instance_key,
                                   merge_op, final_op='Id',
                                   communication_hint='nccl')
  return c0, c1
def testWhileWithScopedAllocator(self):
  group_size = 2
  group_key = 1
  instance_key0 = 1
  instance_key1 = 2
  config = config_pb2.ConfigProto(device_count={'CPU': group_size})
  rewrite_options = config.graph_options.rewrite_options
  rewrite_options.scoped_allocator_optimization = (
      rewriter_config_pb2.RewriterConfig.ON)
  del rewrite_options.scoped_allocator_opts.enable_op[:]
  rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')

  with self.session(config=config) as sess:
    run_ops = []
    for i in range(group_size):
      with ops.device('CPU:%d' % i):
        constant = constant_op.constant(0.)
        cond = lambda i: math_ops.less(i, 10.)
        body = lambda i: math_ops.add(i, 1.)
        input0 = control_flow_ops.while_loop(cond, body, [constant])
        input1 = math_ops.add(constant, 5)
        colred0 = collective_ops.all_reduce(input0, group_size, group_key,
                                            instance_key0, 'Add', 'Id')
        colred1 = collective_ops.all_reduce(input1, group_size, group_key,
                                            instance_key1, 'Add', 'Id')
        run_ops.append(math_ops.add_n([colred0, colred1]))
    results = sess.run(run_ops)
    self.assertEqual(results, [30., 30.])
def run_collective_device_mismatch():
  with ops.device('/CPU:0'):
    in0 = constant_op.constant(t0)
    collective_ops.all_reduce(in0, self._group_size, group_key, instance_key,
                              'Add', 'Id')
  with ops.device('/GPU:0'):
    in1 = constant_op.constant(t1)
    collective_ops.all_reduce(in1, self._group_size, group_key, instance_key,
                              'Add', 'Id')
def testAbortNccl(self):
  self._setup_context(num_gpus=2)
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant(1.)

  # First perform a normal collective to finish resolution.
  def collective_fn():
    for device in ['GPU:0', 'GPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(in_tensor, group_size, group_key,
                                  instance_key, 'Add', 'Id',
                                  communication_hint='nccl')

  def_function.function(collective_fn)()

  # Launch a collective that hangs, and abort the collective executor after
  # the launch.
  def abort_fn():
    time.sleep(2)
    context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

  t = threading.Thread(target=abort_fn)
  t.start()

  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id', communication_hint='nccl')

  # After abortion, subsequent collectives should fail immediately.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id', communication_hint='nccl')

  t.join()
  # Reset the context in order to reset the collective executor.
  context._reset_context()  # pylint: disable=protected-access
  def_function.function(collective_fn)()
def testNcclStress(self):
  self._setup_context(num_gpus=1)
  num_iters = 1000
  for _ in range(num_iters):
    with ops.device('/device:GPU:0'):
      collective_ops.all_reduce([1.], group_size=1, group_key=0,
                                instance_key=0, merge_op='Add',
                                final_op='Id', communication_hint='NCCL')
def testAbortInstanceParamsResolution(self):
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant(1.)

  def collective_fn():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(in_tensor, group_size, group_key,
                                  instance_key, 'Add', 'Id',
                                  communication_hint='ring')

  # First perform a normal all-reduce to complete the group resolution.
  def_function.function(collective_fn)()

  def abort_fn():
    time.sleep(2)
    context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

  t = threading.Thread(target=abort_fn)
  t.start()

  # Use a different instance key to trigger another instance resolution.
  instance_key = 101
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    # This hangs on params resolution since we're only launching one
    # collective for a group size of 2.
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # After abortion, subsequent collectives should fail immediately.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # Reset the context in order to reset the collective executor.
  context._reset_context()  # pylint: disable=protected-access
  t.join()

  # After reset non-NCCL collectives should work.
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  def_function.function(collective_fn)()
def collective_fn():
  for device in ['CPU:0', 'CPU:1']:
    with ops.device(device):
      collective_ops.all_reduce(in_tensor, group_size, group_key,
                                instance_key, 'Add', 'Id',
                                communication_hint='ring')
def run_all_reduce():
  for device in ['CPU:0', 'CPU:1']:
    with ops.device(device):
      collective_ops.all_reduce(input_data, group_size=2,
                                group_key=group_key,
                                instance_key=instance_key, merge_op='Add',
                                final_op='Id', timeout=timeout)
def testExecutionAfterTimeoutV2(self):
  timeout = 1.5
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  context.ensure_initialized()

  group_key = 20
  instance_key = 30
  input_data = constant_op.constant([1, 2, 3, 4])

  @def_function.function
  def run_all_reduce():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(input_data, group_size=2,
                                  group_key=group_key,
                                  instance_key=instance_key, merge_op='Add',
                                  final_op='Id', timeout=timeout)

  # Run a normal all-reduce to complete param resolution.
  run_all_reduce()

  with self.assertRaisesRegex(errors.DeadlineExceededError,
                              'Collective has timed out during execution'):
    with ops.device('CPU:0'):
      collective_ops.all_reduce(input_data, group_size=2,
                                group_key=group_key,
                                instance_key=instance_key, merge_op='Add',
                                final_op='Id', timeout=timeout)

  # We launch the second device after the first device times out. This is to
  # simulate the situation when other workers are slow and the timeout is
  # short. It should error immediately.
  with self.assertRaisesRegex(errors.DeadlineExceededError,
                              'Collective has timed out during execution'):
    with ops.device('CPU:1'):
      # No timeout.
      collective_ops.all_reduce(input_data, group_size=2,
                                group_key=group_key, merge_op='Add',
                                final_op='Id', instance_key=instance_key)
def testAbortRing(self):
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant(1.)

  # First perform a normal collective to finish resolution.
  def collective_fn():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(in_tensor, group_size, group_key,
                                  instance_key, 'Add', 'Id',
                                  communication_hint='ring')

  def_function.function(collective_fn)()

  # Launch a collective that hangs, and abort the collective executor after
  # the launch.
  def abort_fn():
    time.sleep(2)
    context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

  t = threading.Thread(target=abort_fn)
  t.start()

  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # After abortion, subsequent collectives should fail immediately.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # Reset the context in order to reset the collective executor.
  t.join()
  context._reset_context()  # pylint: disable=protected-access

  # After reset non-NCCL collectives should work.
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  def_function.function(collective_fn)()
def testAbortCommunication(self, device, communication):
  if communication == "NCCL":
    self.skipTest("b/171358086: cannot test multi worker NCCL")
  dev0 = "/device:%s:0" % device
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops_with_barrier(cluster_resolver)
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant([1.])

  # First perform a normal all-reduce to complete the group and instance
  # resolution.
  with ops.device(dev0):
    collective_ops.all_reduce(
        in_tensor, group_size, group_key, instance_key,
        communication_hint=communication)

  if cluster_resolver.task_id == 1:

    def abort_fn():
      time.sleep(2)
      context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

    t = threading.Thread(target=abort_fn)
    t.start()

    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.all_reduce(
            in_tensor, group_size, group_key, instance_key,
            communication_hint=communication)

    # After abortion, subsequent collectives should fail immediately.
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.all_reduce(
            in_tensor, group_size, group_key, instance_key,
            communication_hint=communication)

    t.join()

  # Enable collective ops again in order to reset the collective executor.
  enable_collective_ops_with_barrier(cluster_resolver)
  with ops.device(dev0):
    collective_ops.all_reduce(
        in_tensor, group_size, group_key, instance_key,
        communication_hint=communication)
def run_all_reduce(group_key, instance_key, merge_op):
  t0 = [1., 20., 3., 40., 5.]
  t1 = [10., 2., 30., 4., 50.]
  with ops.device('/GPU:0'):
    in0 = constant_op.constant(t0)
    c0 = collective_ops.all_reduce(
        in0, self._group_size, group_key, instance_key, merge_op,
        final_op='Id', communication_hint='nccl')
  with ops.device('/GPU:1'):
    in1 = constant_op.constant(t1)
    c1 = collective_ops.all_reduce(
        in1, self._group_size, group_key, instance_key, merge_op,
        final_op='Id', communication_hint='nccl')
  return c0, c1
def test_dist():
    ts = []
    for task_id in (0, 1):
        with tf.device('/job:worker/task:{0}/device:GPU:0'.format(task_id)):
            t = tf.Variable([1.0, 3.0 * task_id], dtype=tf.float32, name='myvar')
            ts.append(t)
    with tf.device('/job:worker/task:0/device:GPU:0'):
        sum0 = collective_ops.all_reduce(ts[0], 2, 0, 1, 'Add', 'Id')
    with tf.device('/job:worker/task:1/device:GPU:0'):
        sum1 = collective_ops.all_reduce(ts[1], 2, 0, 1, 'Add', 'Id')
    dependency = [sum0, sum1]
    result = [sum0, sum1]
    for i in range(20):
        with tf.control_dependencies(dependency):
            with tf.device('/job:worker/task:0/device:GPU:0'):
                sumb0 = collective_ops.all_reduce(tf.identity(ts[0]), 2, 0, i + 2, 'Add', 'Id')
            with tf.device('/job:worker/task:1/device:GPU:0'):
                sumb1 = collective_ops.all_reduce(tf.identity(ts[1]), 2, 0, i + 2, 'Add', 'Id')
        result.extend([sumb0, sumb1])
        dependency = [sumb0, sumb1]

    resolver = TFConfigClusterResolver()
    cluster = resolver.cluster_spec()
    # dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    #     tf.distribute.experimental.CollectiveCommunication.NCCL)
    # sess_config = dist.update_config_proto(tf.ConfigProto())
    # sess_config.ClearField("device_filters")
    sess_config = tf.ConfigProto()
    with open("dist_config.pbtxt", "r") as f:
        txt = f.read()
    pbtf.Parse(txt, sess_config)
    server = tf.distribute.Server(cluster, job_name="worker", task_index=0,
                                  config=sess_config)
    sess = tf.compat.v1.Session(server.target, config=sess_config)
    sess.run(tf.compat.v1.global_variables_initializer())
    print('tensor value', sess.run(result))
    with open("graph_def", "w") as f:
        f.write(str(tf.get_default_graph().as_graph_def()))
def _broadcast_fallback(self):
    """Sum gradients across devices using TensorFlow collective ops (slow fallback path)."""
    from tensorflow.python.ops import collective_ops  # pylint: disable=no-name-in-module
    global _collective_ops_warning_printed, _collective_ops_group_key, _collective_ops_instance_key
    if all(x.shape.num_elements() == 0 for device in self._devices.values() for x in device.grad_clean.values()):
        return
    if not _collective_ops_warning_printed:
        print("------------------------------------------------------------------------")
        print("WARNING: Using slow fallback implementation for inter-GPU communication.")
        print("Please use TensorFlow 1.14 on Linux for optimal training performance.")
        print("------------------------------------------------------------------------")
        _collective_ops_warning_printed = True
    for device in self._devices.values():
        with tf.device(device.name):
            combo = [tf.reshape(x, [x.shape.num_elements()]) for x in device.grad_clean.values()]
            combo = tf.concat(combo, axis=0)
            combo = collective_ops.all_reduce(combo, merge_op='Add', final_op='Id',
                                              group_size=len(self._devices),
                                              group_key=_collective_ops_group_key,
                                              instance_key=_collective_ops_instance_key)
            cur_ofs = 0
            for var, grad_old in device.grad_clean.items():
                grad_new = tf.reshape(combo[cur_ofs : cur_ofs + grad_old.shape.num_elements()], grad_old.shape)
                cur_ofs += grad_old.shape.num_elements()
                device.grad_clean[var] = grad_new
    _collective_ops_instance_key += 1
def _profile(self, devices):
    from tensorflow.python.ops import collective_ops
    id = self.seed
    self.seed += 1
    result = []
    for size in (2**i for i in range(21)):  # 1 KB to 1 GB
        handles = []
        tf.reset_default_graph()
        for dev in devices:
            with tf.device(dev):
                x = tf.random.uniform((size, 128), dtype=tf.dtypes.float64)
                nccl = collective_ops.all_reduce(x, len(devices), id, id, 'Add', 'Id')
                handles.append(tf.identity(nccl))
        run_meta = tf.compat.v1.RunMetadata()
        run_opt = tf.compat.v1.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        sess = tf.Session(self.target)
        sess.run(handles)
        sess.run(handles, options=run_opt, run_metadata=run_meta)
        time = min(node.all_end_rel_micros
                   for d in run_meta.step_stats.dev_stats
                   for node in d.node_stats
                   if 'CollectiveReduce' in node.node_name)
        result.append((size, time))
    return result
def _testCollectiveReduce(self, inputs, expected, set_graph_key,
                          communication_hint='auto'):
  group_key = 1
  group_size = len(inputs)
  instance_key = 1
  device_type = 'CPU'
  config = config_pb2.ConfigProto(device_count={device_type: group_size})
  devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

  with self.session(config=config) as sess:
    colred = []
    for i in range(group_size):
      with ops.device(devices[i]):
        tensor = constant_op.constant(inputs[i])
        colred.append(
            collective_ops.all_reduce(tensor, group_size, group_key,
                                      instance_key, 'Add', 'Div',
                                      communication_hint=communication_hint))
    run_options = config_pb2.RunOptions()
    if set_graph_key:
      run_options.experimental.collective_graph_key = 1
    results = sess.run(colred, options=run_options)
  for i in range(group_size):
    self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
def all_reduce(self,
               input_tensor,
               control_input=None,
               communication_hint='AUTO',
               timeout=0):
  """All-reduce a dense tensor.

  This can be called in eager mode if an async executor is supplied when
  creating the launcher.

  Args:
    input_tensor: a dense tensor. It must have the same shape on all replicas.
    control_input: if not None, add control edges between control_input and
      the all-reduce.
    communication_hint: string providing hint to runtime for choosing
      collective implementation.
    timeout: a float. The timeout in seconds.

  Returns:
    The reduced tensor.
  """
  instance_key = self._collective_keys.get_instance_key(
      self._group_key, self._device)
  with self._executor_scope(), \
       ops.device(self._device), \
       self._control_input(control_input):
    return collective_ops.all_reduce(
        input_tensor,
        self._group_size,
        self._group_key,
        instance_key,
        communication_hint=communication_hint,
        timeout=timeout)
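A short, hedged sketch of the call pattern for the all_reduce method above; `launcher` is assumed to be an already-constructed instance of the enclosing launcher class, with its group key, group size, collective keys, and device configured elsewhere, so only the documented parameters are exercised.

# Hypothetical call pattern; `launcher` is an assumed, pre-configured instance
# of the class that defines all_reduce above.
per_replica_value = constant_op.constant([0.5, 1.5])
reduced_value = launcher.all_reduce(per_replica_value, timeout=10)
# With the default merge_op/final_op of collective_ops.all_reduce ('Add'/'Id'),
# `reduced_value` is the elementwise sum over the launcher's collective group.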
def run_all_reduce():
  group_key = 10
  instance_key = 20
  t0 = [1, 2, 3, 4]
  t1 = [5, 6, 7, 8]
  with ops.device('/CPU:0'):
    in0 = constant_op.constant(t0)
    c0 = collective_ops.all_reduce(
        in0, group_size=2, group_key=group_key, instance_key=instance_key,
        merge_op='Add', final_op='Id')
  with ops.device('/CPU:1'):
    in1 = constant_op.constant(t1)
    c1 = collective_ops.all_reduce(
        in1, group_size=3, group_key=group_key, instance_key=instance_key,
        merge_op='Add', final_op='Id')
  return c0, c1
def _testCollectiveReduce(self,
                          inputs,
                          expected,
                          set_graph_key,
                          communication_hint='auto',
                          fp16=False,
                          instance_key=1,
                          merge_op='Add',
                          final_op='Div'):
  group_key = 1
  group_size = len(inputs)
  device_type = 'CPU'
  config = config_pb2.ConfigProto(device_count={device_type: group_size})
  devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

  with self.session(config=config) as sess:
    colred = []
    for i in range(group_size):
      with ops.device(devices[i]):
        tensor = constant_op.constant(
            inputs[i], dtype=(dtypes.float16 if fp16 else dtypes.float32))
        colred.append(
            collective_ops.all_reduce(tensor, group_size, group_key,
                                      instance_key, merge_op, final_op,
                                      communication_hint=communication_hint))
    run_options = config_pb2.RunOptions()
    if set_graph_key:
      run_options.experimental.collective_graph_key = 1
    results = sess.run(colred, options=run_options)
  tolerance = 1e-3 if fp16 else 1e-5
  for i in range(group_size):
    logging.info('i {} result {} expected {}'.format(i, results[i], expected))
    self.assertAllClose(results[i], expected, rtol=tolerance, atol=tolerance)
def testNcclHintAllReduce(self):
  inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
            [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
  expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
  group_size = len(inputs)
  group_key = 1
  instance_key = 1
  devices = ['/GPU:{}'.format(i) for i in range(group_size)]

  with self.session(config=self._configure(
      group_size, set_config_proto_nccl=False)) as sess:
    if not test_util.is_gpu_available(cuda_only=True):
      self.skipTest('No GPU available')
    collectives = []
    for i in range(group_size):
      with ops.device(devices[i]):
        t = constant_op.constant(inputs[i])
        collectives.append(
            collective_ops.all_reduce(t, group_size, group_key, instance_key,
                                      'Add', 'Div',
                                      communication_hint='nccl'))
    results = sess.run(collectives)
    for result in results:
      self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
def testFp16Reduce(self):
  inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
            [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
  expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
  group_key = 1
  instance_key = 100
  devices = ['/GPU:{}'.format(i) for i in range(self._group_size)]

  with ops.Graph().as_default(), self.session(
      config=self._configure()) as sess:
    if not test_util.is_gpu_available(cuda_only=True):
      self.skipTest('No GPU available')
    collectives = []
    for i in range(self._group_size):
      with ops.device(devices[i]):
        t = constant_op.constant(inputs[i], dtype=dtypes.float16)
        collectives.append(
            collective_ops.all_reduce(t, self._group_size, group_key,
                                      instance_key, 'Add', 'Div'))
    results = sess.run(collectives)
    for i, result in enumerate(results):
      logging.info('i {} result {} expected {}'.format(i, result, expected))
      self.assertAllClose(result, expected, rtol=1e-3, atol=1e-3)
def run_basic_all_reduce():
  collectives = []
  for i in range(self._group_size):
    with ops.device(self._devices[i]):
      t = constant_op.constant(inputs[i])
      collectives.append(
          collective_ops.all_reduce(t, self._group_size, group_key,
                                    instance_key, 'Add', 'Div'))
  return collectives
def fn(all_args):
  results = []
  # The inputs have no devices set. This is expected to be a trace-time
  # check only.
  self.assertEqual(all_args[0].device, '')
  self.assertEqual(all_args[1].device, '')
  with ops.device('/CPU:0'):
    results.append(
        collective_ops.all_reduce(all_args[0], group_size, group_key,
                                  instance_key, 'Add', 'Div'))
  with ops.device('/CPU:1'):
    results.append(
        collective_ops.all_reduce(all_args[1], group_size, group_key,
                                  instance_key, 'Add', 'Div'))
  return results
def _testCollectiveReduce(self, t0, t1, expected):
  group_key = 1
  instance_key = 1
  with self.test_session(
      config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
    with ops.device('/CPU:0'):
      in0 = constant_op.constant(t0)
      colred0 = collective_ops.all_reduce(in0, 2, group_key, instance_key,
                                          'Add', 'Div', [0])
    with ops.device('/CPU:1'):
      in1 = constant_op.constant(t1)
      colred1 = collective_ops.all_reduce(in1, 2, group_key, instance_key,
                                          'Add', 'Div', [0])
    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 1
    results = sess.run([colred0, colred1], options=run_options)
  self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
  self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
def collective_all_reduce():
  """Call collective allreduce."""
  assert not context.executing_eagerly()
  out_tensors = []
  for d in range(num_devices):
    with ops.device(devices[d]):
      reduce_op = collective_ops.all_reduce(input_tensors[d], group_size,
                                            group_key, instance_key,
                                            reduction_op, unary_op,
                                            subdiv_offsets)
      out_tensors.append(reduce_op)
  return out_tensors
def _testMultipleConcurrentCollectiveReduce(self, t0, t1, expected):
  group_key = 1
  group_size = 2
  num_instances = 2
  all_reduces = []
  config = config_pb2.ConfigProto(device_count={'CPU': group_size})
  config.experimental.collective_deterministic_sequential_execution = True
  with self.session(config=config) as sess:
    for cpu in range(group_size):
      with ops.device('/CPU:%d' % cpu):
        in_tensor = constant_op.constant(t0 if cpu == 0 else t1)
        for instance in range(num_instances):
          all_reduces.append(
              collective_ops.all_reduce(in_tensor, group_size, group_key,
                                        instance, 'Add', 'Div'))
    results = sess.run(all_reduces)
  for i in range(group_size * num_instances):
    self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
def build_collective_reduce(input_tensors,
                            num_workers,
                            collective_keys,
                            reduction_op='Add',
                            unary_op='Id'):
  """Build a subgraph that does one full all-reduce, using the collective Op.

  Args:
    input_tensors: tensors within a single worker graph that are to be reduced
      together; must be one per device.
    num_workers: total number of workers with identical independent graphs that
      will be doing this same reduction. The reduction will actually include
      the corresponding tensors at all these workers.
    collective_keys: a CollectiveKeys object.
    reduction_op: string naming the reduction op.
    unary_op: string naming the unary final op.

  Returns:
    An array of final tensors, one per device, computed by the full reduction.

  Raises:
    ValueError: There must be at least two tensors over all the workers.
  """
  group_size = len(input_tensors) * num_workers
  if group_size < 2:
    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
  devices = [t.device for t in input_tensors]
  num_devices = len(devices)
  group_key = collective_keys.get_group_key(devices)
  instance_key = collective_keys.get_instance_key()
  out_tensors = []
  subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
  for d in range(num_devices):
    with ops.device(devices[d]):
      reduce_op = collective_ops.all_reduce(input_tensors[d], group_size,
                                            group_key, instance_key,
                                            reduction_op, unary_op,
                                            subdiv_offsets)
      out_tensors.append(reduce_op)
  return out_tensors