def test_mpi_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different rank or dimension."""
    with mpi.Session() as session:
        rank = session.run(mpi.rank())
        size = session.run(mpi.size())

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # Same rank, different dimension
        tf.set_random_seed(1234)
        dims = [17 + rank] * 3
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(mpi.allreduce(tensor))

        # Same number of elements, different rank
        tf.set_random_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(mpi.allreduce(tensor))

def test_mpi_allreduce_cpu(self):
    """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
    with mpi.Session() as session:
        size = session.run(mpi.size())

        dtypes = [tf.int32, tf.float32]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tf.set_random_seed(1234)
            tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
            summed = mpi.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "mpi.allreduce produces incorrect results")

def test_mpi_allgather(self):
    # Get MPI rank
    my_rank = int(os.environ['PMI_RANK'])
    num_ranks = int(os.environ['PMI_SIZE'])

    indices_per_rank = 100
    tensor_width = 10

    # Create IndexedSlices for each rank, some with overlapping indices.
    to_gather_indices = []
    to_gather_values = []
    to_gather = []
    for rank_id in range(num_ranks):
        indices = []
        values = []
        my_multiple = rank_id + 1
        current_index = my_multiple
        for i in range(indices_per_rank):
            indices.append(current_index)
            ones_tensor = tf.ones([tensor_width])
            values.append(tf.multiply(
                ones_tensor,
                tf.fill(ones_tensor.get_shape(), float(current_index))))
            current_index += my_multiple
        concat_ind = tf.stack(indices)
        concat_vals = tf.stack(values)
        to_gather_indices.append(concat_ind)
        to_gather_values.append(concat_vals)
        to_gather.append(tf.IndexedSlices(concat_vals, concat_ind))

    # Collect the local IndexedSlices (indices and values) to create
    # correct IndexedSlices output.
    correct_gather_indices = tf.concat(to_gather_indices, 0)
    correct_gather_values = tf.concat(to_gather_values, 0)
    correct_gather = tf.IndexedSlices(correct_gather_values,
                                      correct_gather_indices)

    all_gather = mpi.allreduce(to_gather[my_rank], average_allgather)

    # NOTE: This assumes that device IDs are numbered the same as ranks.
    gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
    config = tf.ConfigProto(gpu_options=gpu_options)

    # MPI Session to test allgather.
    with mpi.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        all_gathered, local_gathered = sess.run([all_gather, correct_gather])

        # Compare all_gathered with local_gathered.
        self.checkAllgather(num_ranks, all_gathered, local_gathered)

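# --- NOTE: hedged sketch, not part of the original test file. ---------------
# test_mpi_allgather above calls self.checkAllgather(), whose definition is
# not shown in this excerpt. The helper below illustrates one way such a
# verification method *could* look, assuming average_allgather is False, the
# gather simply concatenates the per-rank slices (duplicate indices are kept,
# not summed), and both arguments behave like IndexedSlicesValue objects with
# numpy .indices and .values fields. Only the name and signature come from
# the call site; the body is an assumption for illustration.
def checkAllgather(self, num_ranks, all_gathered, local_gathered):
    # Accumulate the row sums belonging to each index on both sides,
    # then compare the two aggregates.
    def accumulate(indexed_slices):
        totals = {}
        for idx, row in zip(indexed_slices.indices, indexed_slices.values):
            totals[int(idx)] = totals.get(int(idx), 0.0) + row.sum()
        return totals

    gathered_totals = accumulate(all_gathered)
    local_totals = accumulate(local_gathered)
    self.assertEqual(sorted(gathered_totals.keys()),
                     sorted(local_totals.keys()))
    for idx, total in local_totals.items():
        self.assertTrue(np.isclose(gathered_totals[idx], total),
                        "allgather mismatch at index {}".format(idx))
# -----------------------------------------------------------------------------
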
def test_mpi_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    with mpi.Session() as session:
        rank = session.run(mpi.rank())
        size = session.run(mpi.size())

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # Same rank and dimensions, different type
        dims = [17] * 3
        tensor = tf.ones(dims,
                         dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(mpi.allreduce(tensor))

def test_mpi_allreduce_gpu(self):
    """Test that the allreduce works on GPUs.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    no_gpus = tf.GPUOptions(visible_device_list="")
    cpu_config = tf.ConfigProto(gpu_options=no_gpus)
    with mpi.Session(config=cpu_config) as session:
        local_rank = session.run(mpi.local_rank())

    one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
    gpu_config = tf.ConfigProto(gpu_options=one_gpu)
    with mpi.Session(config=gpu_config) as session:
        size = session.run(mpi.size())

        dtype = tf.float32
        dim = 3
        with tf.device("/gpu:0"):
            tf.set_random_seed(1234)
            tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
            summed = mpi.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            return

        diff = session.run(max_difference)
        self.assertTrue(diff <= threshold,
                        "mpi.allreduce on GPU produces incorrect results")

def test_mpi_allreduce(self):
    # Get MPI rank
    my_rank = int(os.environ['PMI_RANK'])
    num_ranks = int(os.environ['PMI_SIZE'])

    stages = 13
    batch_size = 1331
    hidden_size = batch_size
    out_size = batch_size

    # Input placeholder (batch_size x hidden) - init to 1s
    inputs = tf.placeholder(tf.float32,
                            shape=(batch_size, hidden_size),
                            name="Input")

    # Large matrices (hidden x out_dim) - init random
    weights = []
    for i in range(stages):
        initer = tf.constant_initializer(pow(2.0, i + 1.0))
        weights.append(tf.get_variable("weights_{}".format(i),
                                       shape=(hidden_size, out_size),
                                       dtype=tf.float32,
                                       initializer=initer))

    # Calculate output through dependent allreduces
    stage_input = inputs
    for i in range(stages):
        inter_output = tf.add(stage_input, weights[i],
                              name="add_red_{}".format(i))
        stage_input = mpi.allreduce(inter_output, average=average_allreduce)

    all_reduced = stage_input

    # Local reduced output for verification
    local_input = inputs
    for i in range(stages):
        inter_output = tf.add(local_input, weights[i],
                              name="addin_loc_{}".format(i))
        my_reducer = tf.Variable(
            initial_value=np.ones((hidden_size, out_size)),
            dtype=tf.float32,
            name="loc_redr_{}".format(i))
        for r in range(num_ranks):
            my_reducer = tf.add(my_reducer, inter_output,
                                name="add_loc_{}_{}".format(i, r))
        if average_allreduce:
            local_input = tf.div(my_reducer, num_ranks,
                                 name="div_loc_{}".format(i))
        else:
            local_input = my_reducer

    local_reduced = local_input

    # NOTE: This assumes that device IDs are numbered the same as ranks
    gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
    config = tf.ConfigProto(gpu_options=gpu_options)

    # MPI Session to test allreduce
    with mpi.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        input_feed = np.ones((batch_size, hidden_size), dtype=np.float32)
        our_output = input_feed[0][0]
        spread_var = 100
        input_feed = input_feed + my_rank * spread_var
        my_output = input_feed[0][0]

        for i in range(stages):
            curr_feed = my_output + pow(2.0, i + 1.0)
            my_output = curr_feed * num_ranks + 1

            curr_our_feed = our_output + pow(2.0, i + 1.0)
            if i == 0:
                sum_ranks = num_ranks * (num_ranks - 1) / 2
                our_output = curr_our_feed * num_ranks + \
                    spread_var * sum_ranks
            else:
                our_output = curr_our_feed * num_ranks

        print("rank {}: My output is {}".format(my_rank, my_output))
        my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
        my_correct = my_correct + my_output

        print("rank {}: Our output is {}".format(my_rank, our_output))
        our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
        our_correct = our_correct + our_output

        for i in range(1000):
            if i % 100 == 0:
                print("{}: iter {}".format(my_rank, i), flush=True)
            feed_dict = {inputs: input_feed}
            out_all_red, out_loc_red = sess.run(
                [all_reduced, local_reduced], feed_dict=feed_dict)

            if not np.allclose(out_loc_red, my_correct) or \
                    not np.allclose(out_all_red, our_correct):
                print("Test incorrect on iter {}".format(i), flush=True)
                self.dumpFailure(my_rank, out_loc_red, my_correct,
                                 out_all_red, our_correct)
            assert np.allclose(out_loc_red, my_correct) and \
                np.allclose(out_all_red, our_correct)

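# --- NOTE: hedged sketch, not part of the original test file. ---------------
# test_mpi_allreduce above calls self.dumpFailure() when a mismatch is
# detected, but its definition is not shown in this excerpt. A minimal
# diagnostic helper consistent with that call site might simply persist the
# offending arrays for offline inspection; the file names below are
# illustrative assumptions, not taken from the source.
def dumpFailure(self, my_rank, out_loc_red, my_correct,
                out_all_red, our_correct):
    # Save the computed and expected tensors, tagged with the MPI rank,
    # so a failing distributed run can be debugged after the fact.
    np.save("out_loc_red_rank{}.npy".format(my_rank), out_loc_red)
    np.save("my_correct_rank{}.npy".format(my_rank), my_correct)
    np.save("out_all_red_rank{}.npy".format(my_rank), out_all_red)
    np.save("our_correct_rank{}.npy".format(my_rank), our_correct)
# -----------------------------------------------------------------------------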