Example No. 1
    def test_mpi_allreduce_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different rank or dimension."""
        with mpi.Session() as session:
            rank = session.run(mpi.rank())
            size = session.run(mpi.size())

            # This test does not apply if there is only one worker.
            if size == 1:
                return

            # Same rank, different dimension
            tf.set_random_seed(1234)
            dims = [17 + rank] * 3
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(mpi.allreduce(tensor))

            # Same number of elements, different rank
            tf.set_random_seed(1234)
            if rank == 0:
                dims = [17, 23 * 57]
            else:
                dims = [17, 23, 57]
            tensor = tf.random_uniform(dims, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(mpi.allreduce(tensor))
Example No. 2
  def test_mpi_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different rank or dimension."""
    with self.test_session() as session:
      rank = session.run(mpi.rank())
      size = session.run(mpi.size())

      # This test does not apply if there is only one worker.
      if size == 1:
        return

      # Same rank, different dimension
      tf.set_random_seed(1234)
      dims = [17 + rank] * 3
      tensor = tf.random_uniform(dims, -1.0, 1.0)
      with self.assertRaises(tf.errors.FailedPreconditionError):
        session.run(mpi.allreduce(tensor))

      # Same number of elements, different rank
      tf.set_random_seed(1234)
      if rank == 0:
        dims = [17, 23 * 57]
      else:
        dims = [17, 23, 57]
      tensor = tf.random_uniform(dims, -1.0, 1.0)
      with self.assertRaises(tf.errors.FailedPreconditionError):
        session.run(mpi.allreduce(tensor))
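Examples 1 and 2 exercise the error path of mpi.allreduce through mpi.Session() and self.test_session() respectively. For orientation, here is a minimal standalone usage sketch; it is not taken from the test suite, the import path is an assumption (the snippets only reference the module as mpi), and the script has to be started through an MPI launcher with more than one process so that mpi.size() > 1.

    # Minimal sketch, not from the original listings. The import path below is
    # an assumption; the Session/rank/size/allreduce calls mirror the examples.
    import tensorflow as tf
    import tensorflow.contrib.mpi_collectives as mpi  # assumed import path

    with mpi.Session() as session:
        rank = session.run(mpi.rank())
        size = session.run(mpi.size())
        # Each rank contributes a constant tensor equal to its rank; after the
        # summing allreduce every rank should hold 0 + 1 + ... + (size - 1).
        tensor = tf.fill([4], float(rank))
        summed = mpi.allreduce(tensor, average=False)
        print("rank {} of {}: {}".format(rank, size, session.run(summed)))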
Example No. 3
    def test_mpi_allreduce_cpu(self):
        """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
        with mpi.Session() as session:
            size = session.run(mpi.size())

            dtypes = [tf.int32, tf.float32]
            dims = [1, 2, 3]
            for dtype, dim in itertools.product(dtypes, dims):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform([17] * dim, -100, 100,
                                           dtype=dtype)
                summed = mpi.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

                # Threshold for floating point equality depends on number of
                # ranks, since we're comparing against precise multiplication.
                if size <= 3:
                    threshold = 0
                elif size < 10:
                    threshold = 1e-4
                elif size < 15:
                    threshold = 5e-4
                else:
                    break

                diff = session.run(max_difference)
                self.assertTrue(diff <= threshold,
                                "mpi.allreduce produces incorrect results")
Example No. 4
  def test_mpi_allreduce_cpu(self):
    """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
    with self.test_session() as session:
      size = session.run(mpi.size())

      dtypes = [tf.int32, tf.float32]
      dims = [1, 2, 3]
      for dtype, dim in itertools.product(dtypes, dims):
        tf.set_random_seed(1234)
        tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
        summed = mpi.allreduce(tensor, average=False)
        multiplied = tensor * size
        max_difference = tf.reduce_max(tf.abs(summed - multiplied))

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3:
          threshold = 0
        elif size < 10:
          threshold = 1e-4
        elif size < 15:
          threshold = 5e-4
        else:
          break

        diff = session.run(max_difference)
        self.assertTrue(diff <= threshold,
                        "mpi.allreduce produces incorrect results")
Example No. 5
    def test_mpi_allgather(self):
        # Get MPI rank
        my_rank = int(os.environ['PMI_RANK'])
        num_ranks = int(os.environ['PMI_SIZE'])

        indices_per_rank = 100
        tensor_width = 10

        # Create IndexedSlices for each rank, some with overlapping indices.
        to_gather_indices = []
        to_gather_values = []
        to_gather = []
        for rank_id in range(num_ranks):
            indices = []
            values = []
            my_multiple = rank_id + 1
            current_index = my_multiple
            for i in range(indices_per_rank):
                indices.append(current_index)
                ones_tensor = tf.ones([tensor_width])
                values.append(
                    tf.multiply(
                        ones_tensor,
                        tf.fill(ones_tensor.get_shape(),
                                float(current_index))))
                current_index += my_multiple
            concat_ind = tf.stack(indices)
            concat_vals = tf.stack(values)
            to_gather_indices.append(concat_ind)
            to_gather_values.append(concat_vals)
            to_gather.append(tf.IndexedSlices(concat_vals, concat_ind))

        # Collect the local IndexedSlices (indices and values) to create
        # correct IndexedSlices output.
        correct_gather_indices = tf.concat(to_gather_indices, 0)
        correct_gather_values = tf.concat(to_gather_values, 0)
        correct_gather = tf.IndexedSlices(correct_gather_values,
                                          correct_gather_indices)

        all_gather = mpi.allreduce(to_gather[my_rank], average_allgather)

        # NOTE: This assumes that device IDs are numbered the same as ranks.
        gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
        config = tf.ConfigProto(gpu_options=gpu_options)

        # MPI Session to test allgather.
        with mpi.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())

            all_gathered, local_gathered = sess.run(
                [all_gather, correct_gather])

            # Compare all_gathered with local_gathered.
            self.checkAllgather(num_ranks, all_gathered, local_gathered)
Example No. 6
  def test_mpi_allgather(self):
    # Get MPI rank
    my_rank = int(os.environ['PMI_RANK'])
    num_ranks = int(os.environ['PMI_SIZE'])

    indices_per_rank = 100
    tensor_width = 10

    # Create IndexedSlices for each rank, some with overlapping indices.
    to_gather_indices = []
    to_gather_values = []
    to_gather = []
    for rank_id in range(num_ranks):
      indices = []
      values = []
      my_multiple = rank_id + 1
      current_index = my_multiple
      for i in range(indices_per_rank):
        indices.append(current_index)
        ones_tensor = tf.ones([tensor_width])
        values.append(tf.multiply(ones_tensor,
                                  tf.fill(ones_tensor.get_shape(),
                                          float(current_index))))
        current_index += my_multiple
      concat_ind = tf.stack(indices)
      concat_vals = tf.stack(values)
      to_gather_indices.append(concat_ind)
      to_gather_values.append(concat_vals)
      to_gather.append(tf.IndexedSlices(concat_vals, concat_ind))

    # Collect the local IndexedSlices (indices and values) to create
    # correct IndexedSlices output.
    correct_gather_indices = tf.concat(to_gather_indices, 0)
    correct_gather_values = tf.concat(to_gather_values, 0)
    correct_gather = tf.IndexedSlices(correct_gather_values,
                                      correct_gather_indices)

    all_gather = mpi.allreduce(to_gather[my_rank], average_allgather)

    # NOTE: This assumes that device IDs are numbered the same as ranks.
    gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
    config = tf.ConfigProto(gpu_options=gpu_options)

    # MPI Session to test allgather.
    with mpi.Session(config=config) as sess:
      sess.run(tf.global_variables_initializer())

      all_gathered, local_gathered = sess.run([all_gather, correct_gather])

      # Compare all_gathered with local_gathered.
      self.checkAllgather(num_ranks, all_gathered, local_gathered)
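Examples 5 and 6 pass a module-level flag, average_allgather, into mpi.allreduce, and Examples 11 and 12 below reference a similar average_allreduce flag; neither is defined in the snippets, and self.checkAllgather and self.dumpFailure are likewise external helpers. A purely illustrative sketch of how such flags might be declared is shown here; the original test module presumably sets them from its command line or configuration.

    # Hypothetical module-level flags assumed by the snippets above and below;
    # not part of the original listings. False means the collective sums
    # contributions instead of averaging them.
    average_allreduce = False
    average_allgather = False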
Example No. 7
  def test_mpi_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    with self.test_session() as session:
      rank = session.run(mpi.rank())
      size = session.run(mpi.size())

      # This test does not apply if there is only one worker.
      if size == 1:
        return

      # Same rank and dimension, different dtype
      dims = [17] * 3
      tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32)
      with self.assertRaises(tf.errors.FailedPreconditionError):
        session.run(mpi.allreduce(tensor))
Example No. 8
    def test_mpi_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        with mpi.Session() as session:
            rank = session.run(mpi.rank())
            size = session.run(mpi.size())

            # This test does not apply if there is only one worker.
            if size == 1:
                return

            # Same rank and dimension, different dtype
            dims = [17] * 3
            tensor = tf.ones(dims,
                             dtype=tf.int32 if rank % 2 == 0 else tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(mpi.allreduce(tensor))
Example No. 9
    def test_mpi_allreduce_gpu(self):
        """Test that the allreduce works on GPUs.

        This test will crash badly if used with an MPI implementation that does
        not support GPU memory transfers directly, as it will call MPI_Send on
        a GPU data pointer."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        no_gpus = tf.GPUOptions(visible_device_list="")
        cpu_config = tf.ConfigProto(gpu_options=no_gpus)
        with mpi.Session(config=cpu_config) as session:
            local_rank = session.run(mpi.local_rank())

        one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
        gpu_config = tf.ConfigProto(gpu_options=one_gpu)
        with mpi.Session(config=gpu_config) as session:
            size = session.run(mpi.size())

            dtype = tf.float32
            dim = 3
            with tf.device("/gpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
                summed = mpi.allreduce(tensor, average=False)
                multiplied = tensor * size
                max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "mpi.allreduce on GPU produces incorrect results")
Example No. 10
  def test_mpi_allreduce_gpu(self):
    """Test that the allreduce works on GPUs.

    This test will crash badly if used with an MPI implementation that does
    not support GPU memory transfers directly, as it will call MPI_Send on
    a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
      return

    no_gpus = tf.GPUOptions(visible_device_list="")
    cpu_config = tf.ConfigProto(gpu_options=no_gpus)
    with self.test_session(config=cpu_config) as session:
      local_rank = session.run(mpi.local_rank())

    one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
    gpu_config = tf.ConfigProto(gpu_options=one_gpu)
    with self.test_session(config=gpu_config) as session:
      size = session.run(mpi.size())

      dtype = tf.float32
      dim = 3
      with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
        summed = mpi.allreduce(tensor, average=False)
        multiplied = tensor * size
        max_difference = tf.reduce_max(tf.abs(summed - multiplied))

      # Threshold for floating point equality depends on number of
      # ranks, since we're comparing against precise multiplication.
      if size <= 3:
        threshold = 0
      elif size < 10:
        threshold = 1e-4
      elif size < 15:
        threshold = 5e-4
      else:
        return

      diff = session.run(max_difference)
      self.assertTrue(diff <= threshold,
                      "mpi.allreduce on GPU produces incorrect results")
Example No. 11
  def test_mpi_allreduce(self):
    # Get MPI rank
    my_rank = int(os.environ['PMI_RANK'])
    num_ranks = int(os.environ['PMI_SIZE'])

    stages = 13
    batch_size = 1331
    hidden_size = batch_size
    out_size = batch_size

    # Input placeholder (batch_size x hidden) - init to 1s
    inputs = tf.placeholder(tf.float32, shape=(batch_size, hidden_size),
                            name="Input")

    # Large matrices (hidden x out_dim) - init random
    weights = []
    for i in range(stages):
      initer = tf.constant_initializer(pow(2.0, i + 1.0))
      weights.append(tf.get_variable("weights_{}".format(i),
                                     shape=(hidden_size, out_size),
                                     dtype=tf.float32,
                                     initializer=initer))

    # Calculate output through dependent allreduces
    stage_input = inputs
    for i in range(stages):
      inter_output = tf.add(stage_input, weights[i],
                            name="add_red_{}".format(i))
      stage_input = mpi.allreduce(inter_output,
                                  average=average_allreduce)

    all_reduced = stage_input

    # Local reduced output for verification
    local_input = inputs
    for i in range(stages):
      inter_output = tf.add(local_input, weights[i],
                            name="addin_loc_{}".format(i))
      my_reducer = tf.Variable(initial_value=np.ones((hidden_size, out_size)),
                               dtype=tf.float32, name="loc_redr_{}".format(i))
      for r in range(num_ranks):
        my_reducer = tf.add(my_reducer, inter_output,
                            name="add_loc_{}_{}".format(i, r))
      if average_allreduce:
        local_input = tf.div(my_reducer, num_ranks,
                             name="div_loc_{}".format(i))
      else:
        local_input = my_reducer

    local_reduced = local_input

    # NOTE: This assumes that device IDs are numbered the same as ranks
    gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
    config = tf.ConfigProto(gpu_options=gpu_options)

    # MPI Session to test allreduce
    with mpi.Session(config=config) as sess:
      sess.run(tf.global_variables_initializer())

      input_feed = np.ones((batch_size, hidden_size), dtype=np.float32)
      our_output = input_feed[0][0]
      spread_var = 100
      input_feed = input_feed + my_rank * spread_var
      my_output = input_feed[0][0]
      for i in range(stages):
        curr_feed = my_output + pow(2.0, i + 1.0)
        my_output = curr_feed * num_ranks + 1
        curr_our_feed = our_output + pow(2.0, i + 1.0)
        if i == 0:
          sum_ranks = num_ranks * (num_ranks - 1) / 2
          our_output = curr_our_feed * num_ranks + \
            spread_var * sum_ranks
        else:
          our_output = curr_our_feed * num_ranks

      print("rank {}: My output is {}".format(my_rank, my_output))
      my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
      my_correct = my_correct + my_output
      print("rank {}: Our output is {}".format(my_rank, our_output))
      our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
      our_correct = our_correct + our_output

      for i in range(1000):
        if i % 100 == 0:
          print("{}: iter {}".format(my_rank, i), flush=True)
        feed_dict = {inputs: input_feed}
        out_all_red, out_loc_red \
          = sess.run([all_reduced, local_reduced],
                     feed_dict=feed_dict)

        if not np.allclose(out_loc_red, my_correct) or \
           not np.allclose(out_all_red, our_correct):
          print("Test incorrect on iter {}".format(i), flush=True)
          self.dumpFailure(my_rank, out_loc_red, my_correct, out_all_red,
                           our_correct)
          assert(np.allclose(out_loc_red, my_correct) and
                 np.allclose(out_all_red, our_correct))
Example No. 12
    def test_mpi_allreduce(self):
        # Get MPI rank
        my_rank = int(os.environ['PMI_RANK'])
        num_ranks = int(os.environ['PMI_SIZE'])

        stages = 13
        batch_size = 1331
        hidden_size = batch_size
        out_size = batch_size

        # Input placeholder (batch_size x hidden) - init to 1s
        inputs = tf.placeholder(tf.float32,
                                shape=(batch_size, hidden_size),
                                name="Input")

        # Large matrices (hidden x out_dim) - init random
        weights = []
        for i in range(stages):
            initer = tf.constant_initializer(pow(2.0, i + 1.0))
            weights.append(
                tf.get_variable("weights_{}".format(i),
                                shape=(hidden_size, out_size),
                                dtype=tf.float32,
                                initializer=initer))

        # Calculate output through dependent allreduces
        stage_input = inputs
        for i in range(stages):
            inter_output = tf.add(stage_input,
                                  weights[i],
                                  name="add_red_{}".format(i))
            stage_input = mpi.allreduce(inter_output,
                                        average=average_allreduce)

        all_reduced = stage_input

        # Local reduced output for verification
        local_input = inputs
        for i in range(stages):
            inter_output = tf.add(local_input,
                                  weights[i],
                                  name="addin_loc_{}".format(i))
            my_reducer = tf.Variable(initial_value=np.ones(
                (hidden_size, out_size)),
                                     dtype=tf.float32,
                                     name="loc_redr_{}".format(i))
            for r in range(num_ranks):
                my_reducer = tf.add(my_reducer,
                                    inter_output,
                                    name="add_loc_{}_{}".format(i, r))
            if average_allreduce:
                local_input = tf.div(my_reducer,
                                     num_ranks,
                                     name="div_loc_{}".format(i))
            else:
                local_input = my_reducer

        local_reduced = local_input

        # NOTE: This assumes that device IDs are numbered the same as ranks
        gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
        config = tf.ConfigProto(gpu_options=gpu_options)

        # MPI Session to test allreduce
        with mpi.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())

            input_feed = np.ones((batch_size, hidden_size), dtype=np.float32)
            our_output = input_feed[0][0]
            spread_var = 100
            input_feed = input_feed + my_rank * spread_var
            my_output = input_feed[0][0]
            for i in range(stages):
                curr_feed = my_output + pow(2.0, i + 1.0)
                my_output = curr_feed * num_ranks + 1
                curr_our_feed = our_output + pow(2.0, i + 1.0)
                if i == 0:
                    sum_ranks = num_ranks * (num_ranks - 1) / 2
                    our_output = curr_our_feed * num_ranks + \
                      spread_var * sum_ranks
                else:
                    our_output = curr_our_feed * num_ranks

            print("rank {}: My output is {}".format(my_rank, my_output))
            my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
            my_correct = my_correct + my_output
            print("rank {}: Our output is {}".format(my_rank, our_output))
            our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
            our_correct = our_correct + our_output

            for i in range(1000):
                if i % 100 == 0:
                    print("{}: iter {}".format(my_rank, i), flush=True)
                feed_dict = {inputs: input_feed}
                out_all_red, out_loc_red \
                  = sess.run([all_reduced, local_reduced],
                             feed_dict=feed_dict)

                if not np.allclose(out_loc_red, my_correct) or \
                   not np.allclose(out_all_red, our_correct):
                    print("Test incorrect on iter {}".format(i), flush=True)
                    self.dumpFailure(my_rank, out_loc_red, my_correct,
                                     out_all_red, our_correct)
                    assert (np.allclose(out_loc_red, my_correct)
                            and np.allclose(out_all_red, our_correct))
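Examples 11 and 12 verify the chained collectives against values computed analytically: because the feed is constant across the batch, a single scalar recurrence predicts every element of both the locally reduced and the all-reduced outputs. Below is a standalone restatement of that recurrence in plain Python, assuming average_allreduce is False, as the expected-value arithmetic in the examples implies.

    # Standalone sketch of the expected-value recurrence used in the two
    # examples above; assumes average_allreduce is False (pure summation).
    def expected_outputs(my_rank, num_ranks, stages=13, spread_var=100):
        my_output = 1.0 + my_rank * spread_var   # this rank's locally reduced path
        our_output = 1.0                         # the all-reduced path
        for i in range(stages):
            # Local path: add the constant weight 2**(i+1), sum num_ranks
            # copies, plus 1 from the ones-initialized reducer variable.
            my_output = (my_output + 2.0 ** (i + 1)) * num_ranks + 1
            # All-reduced path: the rank-dependent spread enters only once, at
            # the first stage, as spread_var * (0 + 1 + ... + (num_ranks - 1)).
            step = (our_output + 2.0 ** (i + 1)) * num_ranks
            if i == 0:
                step += spread_var * num_ranks * (num_ranks - 1) / 2
            our_output = step
        return my_output, our_output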