Example #1
def hier_setup():
    os.environ['BLUEFOG_NODES_PER_MACHINE'] = '2'
    bf.init()
    assert bf.size() % 2 == 0
    machine_size = int(bf.size() // 2)
    bf.set_machine_topology(bf.ExponentialGraph(machine_size))
    return bf.rank(), bf.size(), bf.local_rank(), bf.local_size()
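A minimal usage sketch, assuming bluefog.torch is imported as bf, os is imported, and the script is launched (e.g. via bfrun) with an even number of processes as the assert requires:

rank, size, local_rank, local_size = hier_setup()
print("rank {}/{}, local rank {}/{}".format(rank, size, local_rank, local_size))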
Example #2
    def test_set_topology_fail_with_win_create(self):
        bf.init()
        size = bf.size()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return

        tensor = torch.FloatTensor([1])
        window_name = "win_create_test"
        is_created = bf.win_create(tensor, window_name)
        assert is_created, "bf.win_create did not create the window object successfully."

        if size == 1:
            expected_topology = nx.from_numpy_array(np.array([[0.5]]),
                                                    create_using=nx.DiGraph)
        elif size == 2:
            expected_topology = nx.from_numpy_array(np.array([[0, 0.2],
                                                              [0.2, 0]]),
                                                    create_using=nx.DiGraph)
        else:
            expected_topology = RingGraph(size)

        is_set = bf.set_topology(expected_topology)
        assert not is_set, "bf.set_topology did not fail even though a window was created."

        topology = bf.load_topology()
        assert isinstance(topology, nx.DiGraph)
        assert IsTopologyEquivalent(topology, ExponentialGraph(size))

        is_freed = bf.win_free()
        assert is_freed, "bf.win_free did not free the window object successfully."
Example #3
    def __init__(self, params, model, num_steps_per_communication):
        super(self.__class__, self).__init__(params)

        # Used to control the behavior of win_accumulate dynamically.
        outdegree = len(bf.out_neighbor_ranks())
        self.dst_weights = {
            rank: 1.0 / (outdegree + 1)
            for rank in bf.out_neighbor_ranks()
        }
        self.self_weight = 1.0 / (outdegree + 1)
        self.force_barrier = True

        named_parameters, models = _check_named_parameters(self, model)
        self._models = models
        self._parameter_names = {v: k for k, v in sorted(named_parameters)}
        self._handles = {}  # store parameter -> handle
        self._named_ps_weights = {}
        self._named_extension_parameters = {}
        self._synchronized = False
        self._should_synchronize = True
        self._use_timeline = False
        self._num_steps_per_communication = num_steps_per_communication
        self._pushsum_delay = {
            v: self._num_steps_per_communication
            for _, v in sorted(named_parameters)
        }
        self._timeline_hook_handles = []
        if bf.size() > 1:
            self._register_window()
            self._register_hooks()
Example #4
def adjust_learning_rate(epoch, batch_idx):
    if epoch < args.warmup_epochs:
        epoch += float(batch_idx + 1) / len(train_loader)
        lr_adj = 1.0 / bf.size() * (epoch * (bf.size() - 1) / args.warmup_epochs + 1)
    elif epoch < 30:
        lr_adj = 1.0
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    for param_group in optimizer.param_groups:
        param_group["lr"] = (
            args.base_lr * bf.size() * args.batches_per_allreduce * lr_adj
        )
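As a quick sanity check on the warmup branch above, a standalone sketch with assumed values (size 4, 5 warmup epochs, evaluated at the end of each epoch) shows lr_adj ramping linearly from 1/size to 1:

size = 4             # assumed bf.size()
warmup_epochs = 5    # assumed args.warmup_epochs
for epoch in [0, 2, 4]:
    e = epoch + 1.0  # (batch_idx + 1) / len(train_loader) == 1 at the end of the epoch
    lr_adj = 1.0 / size * (e * (size - 1) / warmup_epochs + 1)
    print(epoch, lr_adj)  # 0.4, 0.7, 1.0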
Example #5
    def test_timeline_push_sum(self):
        # Use win_accumulate to simulate the push-sum algorithm (sync).
        outdegree = len(bf.out_neighbor_ranks())
        indegree = len(bf.in_neighbor_ranks())
        # We append the weight p at the end of the data.
        x = torch.Tensor(
            [bf.rank() / (indegree + 1), 1.0 / bf.size() / (indegree + 1)])

        # Remember that we do not create the buffer with zeros.
        bf.win_create(x, name="x_buff")
        x = bf.win_update_then_collect(name="x_buff")

        for _ in range(10):
            bf.win_accumulate(x,
                              name="x_buff",
                              dst_weights={
                                  rank: 1.0 / (outdegree + 1)
                                  for rank in bf.out_neighbor_ranks()
                              },
                              require_mutex=True)
            x.div_(1 + outdegree)
            x = bf.win_update_then_collect(name="x_buff")

        bf.barrier()
        # Do not forget to sync at the end!
        x = bf.win_update_then_collect(name="x_buff")

        file_name = f"{self.temp_file}{bf.rank()}.json"
        with open(file_name, 'r') as tf:
            timeline_text = tf.read()
            assert 'MPI_WIN_ACCUMULATE' in timeline_text, timeline_text
            assert 'ENQUEUE_WIN_ACCUMULATE' in timeline_text, timeline_text

        bf.win_free()
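For reference, the push-sum behavior this test mimics can be sketched without MPI: every node keeps half of its (value, weight) pair and pushes the other half to one out-neighbor on a directed ring; the ratio value/weight converges to the global average. A self-contained NumPy illustration (not part of the test, 8 hypothetical nodes):

import numpy as np

n = 8
values = np.arange(n, dtype=float)   # node i starts with value i
weights = np.ones(n)                 # the associated p starts at 1 on every node
for _ in range(200):
    # keep 1 / (outdegree + 1) = 0.5 locally, push 0.5 to the right neighbor
    values = values / 2.0 + np.roll(values / 2.0, 1)
    weights = weights / 2.0 + np.roll(weights / 2.0, 1)
print(values / weights)              # every entry approaches the average (n - 1) / 2 = 3.5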
Example #6
    def test_win_get(self):
        """Test that the window get operation."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use exponential two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor
        avg_value = (rank + np.sum(neighbor_ranks)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_get(window_name)
            bf.barrier()
            recv_tensor = bf.win_update(window_name, clone=True)

            assert (list(recv_tensor.shape) == [DIM_SIZE] *
                    dim), ("bf.win_get produce wrong shape tensor.")
            assert (recv_tensor.data - avg_value).abs().max() < EPSILON, (
                "bf.win_get produce wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    recv_tensor.min(), recv_tensor.max(), avg_value, rank))
Example #7
def InferDestinationFromSourceRanks(
    src_ranks: List[int], construct_adjacency_matrix: bool = False,
) -> Union[List[int], np.array]:
    """Infer the destination ranks from source ranks. This is collective communication call.

    Args:
        src_ranks: A list of destination ranks.
        construct_adjacency_matrix: If true, adjacency matrix will be return instead.
            Element w_{ij} represents the weights sending from node i to node j.
            We use column normalized style, i.e. the sum of receiving weight is 1.

    Raises:
        ValueError: If dst_ranks or src_ranks does not contain integer from 0 to size-1.

    Returns:
        If construct_adjacency_matrix is false, returns the destination ranks list.
        If construct_adjacency_matrix is true, returns the the sodestinationrce ranks
        list and a 2-D numpy array.
    """
    is_valid, error_msg = _check_ranks(src_ranks, bf.rank(), bf.size())
    assert is_valid, f"The format of src_ranks is wrong: {error_msg}"
    return _infer_topo(
        src_ranks,
        transpose=True,
        construct_adjacency_matrix=construct_adjacency_matrix,
    )
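A hedged usage sketch: the call is collective, so every rank must invoke it; feeding in the current in-neighbors should recover the out-neighbors (assuming bluefog.torch is imported as bf and a topology is already set):

src = bf.in_neighbor_ranks()
dst_ranks, W = InferDestinationFromSourceRanks(
    src_ranks=src, construct_adjacency_matrix=True)
assert sorted(dst_ranks) == sorted(bf.out_neighbor_ranks())
# W[i, j] is the weight node i sends to node j; each column sums to 1.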
Example #8
    def test_win_update_with_given_weights(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create did not create the window object successfully."

            # Test simple average rule.
            weight = 1.0 / (len(bf.in_neighbor_ranks()) + 1)
            sync_result = bf.win_update(
                window_name,
                self_weight=weight,
                neighbor_weights={x: weight
                                  for x in bf.in_neighbor_ranks()})
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update (weighted) produces wrong shape tensor.")
            assert (sync_result.data - rank).abs().max() < EPSILON, (
                "bf.win_update (weighted) produces wrong tensor value " +
                "[{0}-{1}]!={2} at rank {2}.".format(sync_result.min(),
                                                     sync_result.max(), rank))
Example #9
    def test_win_update_then_collect(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        indegree = int(np.ceil(np.log2(size)))
        expected_result = rank * (indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_update_collect_{}_{}".format(dim, dtype)

            bf.win_create(tensor, window_name)

            # After the collect ops, the neighbor tensor will become zero.
            # So second win_update_then_collect should produce the same value.
            for _ in range(2):
                collect_tensor = bf.win_update_then_collect(window_name)

                assert (list(collect_tensor.shape) == [DIM_SIZE] * dim), (
                    "bf.win_update_then_collect produces wrong shape tensor.")
                assert (collect_tensor.data - expected_result).abs().max(
                ) < EPSILON, (
                    "bf.win_update_then_collect produces wrong tensor value " +
                    "[{0}-{1}]!={2} at rank {2}.".format(
                        collect_tensor.min(), collect_tensor.max(), rank))
Example #10
    def test_win_mutex_full(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 2:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 3 nodes".format(fname))
            return
        bf.set_topology(topology_util.FullyConnectedGraph(size))

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        for dtype in dtypes:
            tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_mutex_full_{}".format(dtype)
            bf.win_create(tensor, window_name)

            if rank == 0:
                with bf.win_mutex(window_name, for_self=True):
                    bf.barrier()
                    time.sleep(1.01)
            else:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) > 1, \
                    "The mutex acquire time should be longer than 1 second"
                assert (t_end - t_start) < 2, \
                    "The mutex acquire time should be shorter than 2 second"
Example #11
def _infer_topo(
    rank_list: List[int], transpose: bool, construct_adjacency_matrix: bool
):
    degree = len(rank_list)
    all_degree_list = bf.allgather(torch.tensor([degree], dtype=torch.int32)).numpy()
    all_rank_list = bf.allgather(torch.tensor(rank_list, dtype=torch.int32)).numpy()
    adjacency_dict = dict()
    displacement = 0
    for i, degree in enumerate(all_degree_list):
        adjacency_dict[i] = sorted(all_rank_list[displacement : displacement + degree])
        displacement += degree

    inv_adjacency_dict = collections.defaultdict(list)
    for k, adj in adjacency_dict.items():
        for v in adj:
            inv_adjacency_dict[v].append(k)
    return_list = inv_adjacency_dict.get(bf.rank())
    if return_list is None:
        return_list = []

    if not construct_adjacency_matrix:
        return return_list

    # construct_adjacency_matrix
    W = np.eye(bf.size())
    for k, adj in adjacency_dict.items():
        W[k, adj] = 1
    if transpose:
        W = W.T

    return return_list, W / W.sum(axis=1)
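The heart of _infer_topo is a plain adjacency inversion; a standalone sketch of that step for a hypothetical 4-node directed ring, with no MPI involved:

import collections

adjacency_dict = {0: [3], 1: [0], 2: [1], 3: [2]}   # node -> ranks it reported
inv_adjacency_dict = collections.defaultdict(list)
for k, adj in adjacency_dict.items():
    for v in adj:
        inv_adjacency_dict[v].append(k)
print(dict(inv_adjacency_dict))   # {3: [0], 0: [1], 1: [2], 2: [3]}, i.e. the reversed edges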
Example #12
    def __init__(self, params, model, num_steps_per_communication, pull_style):
        super(self.__class__, self).__init__(params)

        if pull_style:
            self.src_weights = None  # Used to control the behavior of win_get dynamically.
        else:
            self.dst_weights = None  # Used to control the behavior of win_put dynamically.
        self.force_barrier = False

        named_parameters, models = _check_named_parameters(self, model)
        self._models = models
        self._pull_style = pull_style
        self._parameter_names = {v: k for k, v in sorted(named_parameters)}
        self._handles = {}  # store parameter -> handle
        self._synchronized = False
        self._should_synchronize = True
        self._use_timeline = False
        self._num_steps_per_communication = num_steps_per_communication
        self._bluefog_delay = {
            v: self._num_steps_per_communication
            for _, v in sorted(named_parameters)
        }
        self._timeline_hook_handles = []
        if os.getenv('BLUEFOG_TIMELINE'):
            self.turn_on_timeline()
        if bf.size() > 1:
            self._register_window()
            self._register_hooks()
Example #13
    def test_bluefog_size(self):
        """Test that the size returned by bf.size() is correct."""
        _, true_size = mpi_env_rank_and_size()
        bf.init()
        size = bf.size()
        # print("Size: ", true_size, size)
        assert true_size == size
Example #14
    def test_get_win_version_with_win_get(self):
        """Test version window is initialized, updated and cleared correctly with win get."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use exponential two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            original_versions = list(bf.get_win_version(window_name).values())
            bf.barrier()
            bf.win_get(window_name)
            bf.barrier()
            versions_after_win_get = list(
                bf.get_win_version(window_name).values())
            bf.win_update(window_name, clone=True)
            versions_after_win_update = list(
                bf.get_win_version(window_name).values())
            neighbor_ranks_number = len(neighbor_ranks)

            zero_number_in_original_versions = len(
                original_versions) - np.count_nonzero(original_versions)
            assert ((zero_number_in_original_versions) == neighbor_ranks_number
                    ), ("version initialization is wrong.")

            zero_number_after_win_update = len(
                versions_after_win_update) - np.count_nonzero(
                    versions_after_win_update)
            assert ((zero_number_after_win_update) == neighbor_ranks_number), (
                "version clear up is wrong.")

            expected_versions_after_win_get = [1] * neighbor_ranks_number

            assert (versions_after_win_get == expected_versions_after_win_get
                    ), ("version after win get is wrong.")

        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free did not free the window object successfully."
Example #15
    def test_in_out_neighbors_expo2(self):
        bf.init()
        rank = bf.rank()
        size = bf.size()
        assert bf.set_topology(ExponentialGraph(size))
        in_neighbors = bf.in_neighbor_ranks()
        out_neighbors = bf.out_neighbor_ranks()

        degree = int(np.ceil(np.log2(size)))
        expected_in_neighbors = sorted([(rank - 2**i) % size
                                        for i in range(degree)])
        expected_out_neighbors = sorted([(rank + 2**i) % size
                                         for i in range(degree)])
        assert sorted(in_neighbors) == expected_in_neighbors
        assert sorted(out_neighbors) == expected_out_neighbors
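The expected neighbor sets above follow directly from the exponential-2 rule; a standalone check for an assumed size of 8 and rank 0:

import numpy as np

size, rank = 8, 0
degree = int(np.ceil(np.log2(size)))                               # 3
in_neighbors = sorted((rank - 2**i) % size for i in range(degree))
out_neighbors = sorted((rank + 2**i) % size for i in range(degree))
print(in_neighbors, out_neighbors)                                 # [4, 6, 7] [1, 2, 4]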
Example #16
def problem_setup(net=LinearNet):
    bf.init()
    num_epochs = 50
    batch_size = 128
    num_train_per_node = 1024
    num_test_per_node = 128
    lr = 0.01

    # Setup Problem
    problem_builder = LinearProblemBuilder()
    train_dataset = problem_builder.get_dataset(num_train_per_node)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    test_dataset = problem_builder.get_dataset(num_test_per_node)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
    # Setup Model
    model = net(problem_builder.input_dim, problem_builder.output_dim)
    assert (
        num_train_per_node*bf.size() >= model.num_parameters
    ), "The number of samples is too small making it an underdetermined system."
    # Setup Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr*bf.size())
    bf.broadcast_parameters(model.state_dict(), root_rank=0)
    bf.broadcast_optimizer_state(optimizer, root_rank=0)
    return problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs
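A minimal sketch of how the returned objects might be consumed, assuming the dataset yields (input, target) pairs and a plain MSE regression objective; any distributed optimizer wrapping is omitted here:

import torch.nn.functional as F

problem_builder, train_loader, test_loader, model, optimizer, num_epochs = problem_setup()
for epoch in range(num_epochs):
    for x, y in train_loader:
        optimizer.zero_grad()
        loss = F.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()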
Example #17
    def test_associated_with_p(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 3:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 4 nodes".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        bf.set_topology(topology_util.RingGraph(size))
        bf.turn_on_win_ops_with_associated_p()
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(
                dtype, send_rank)
            bf.win_create(tensor, window_name)
            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
            if rank == send_rank:
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  self_weight=0.5,
                                  dst_weights={
                                      left_neighbor_rank: 0.5,
                                      right_neighbor_rank: 0.5
                                  })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif (rank == left_neighbor_rank) or (rank == right_neighbor_rank):
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".
                    format(rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".
                    format(rank, associated_p))
        bf.turn_off_win_ops_with_associated_p()
Example #18
def test_infer_source_from_destination_ranks(topo_func):
    bf.init()
    size = bf.size()
    bf.set_topology(topo_func(size))
    topo = bf.load_topology()
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    # Make W follow the averaging rule.
    expected_W = (nx.to_numpy_array(topo) > 0).astype(float)
    expected_W /= expected_W.sum(axis=0)

    src_ranks, W = InferSourceFromDestinationRanks(
        dst_ranks=out_neighbors, construct_adjacency_matrix=True)
    assert sorted(src_ranks) == in_neighbors
    np.testing.assert_allclose(W, expected_W)
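For concreteness, the column-normalized averaging matrix built above looks like this for a hypothetical 4-node bidirectional ring with self-loops (standalone NumPy/networkx, no MPI):

import networkx as nx
import numpy as np

G = nx.DiGraph()
G.add_edges_from([(i, (i + 1) % 4) for i in range(4)] +
                 [(i, (i - 1) % 4) for i in range(4)] +
                 [(i, i) for i in range(4)])        # ring in both directions plus self-loops
W = (nx.to_numpy_array(G) > 0).astype(float)
W /= W.sum(axis=0)                                  # each column (receiving weights) sums to 1
print(W)                                            # every nonzero entry equals 1/3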
Example #19
    def test_set_and_load_topology(self):
        bf.init()
        size = bf.size()
        if size == 4:
            expected_topology = nx.DiGraph(
                np.array([[1 / 3., 1 / 3., 1 / 3., 0.],
                          [0., 1 / 3., 1 / 3., 1 / 3.],
                          [1 / 3., 0., 1 / 3., 1 / 3.],
                          [1 / 3., 1 / 3., 0., 1 / 3.]]))
        elif size == 1:
            expected_topology = nx.DiGraph(np.array([[1.0]]))
        else:
            expected_topology = ExponentialGraph(size)
        topology = bf.load_topology()
        assert isinstance(topology, nx.DiGraph)
        assert IsTopologyEquivalent(expected_topology, topology)
Example #20
    def test_win_put_with_varied_tensor_elements(self):
        """Test that the window put operation."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use exponential two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor
        avg_value = (rank + np.sum(neighbor_ranks)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            base_tensor = torch.arange(
                DIM_SIZE**dim, dtype=torch.float32).view_as(tensor).div(1000)
            tensor = self.cast_and_place(tensor, dtype)
            base_tensor = self.cast_and_place(base_tensor, dtype)
            tensor = tensor + base_tensor
            window_name = "win_put_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            bf.win_put(tensor, window_name)
            bf.barrier()
            sync_result = bf.win_update(window_name)
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_put produces wrong shape tensor.")
            assert (
                (sync_result - base_tensor).data -
                avg_value).abs().max() < EPSILON, (
                    "bf.win_update after win_put produces wrong tensor value "
                    + "[{}-{}]!={} at rank {}.".format(
                        (sync_result - base_tensor).min(),
                        (sync_result - base_tensor).max(), avg_value, rank))

        time.sleep(0.5)
        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_put_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free did not free the window object successfully."
Example #21
    def __init__(self,
                 params,
                 model,
                 reduce_type,
                 num_steps_per_communication=1):
        super(self.__class__, self).__init__(params)

        named_parameters, models = _check_named_parameters(self, model)
        # knobs for neighbor communication behavior
        self.self_weight = None
        self.neighbor_weights = None
        self.send_neighbors = None
        self.neighbor_machine_weights = None
        self.send_neighbor_machines = None
        self.enable_topo_check = False

        self._models = models
        self._parameter_names = {v: k for k, v in sorted(named_parameters)}
        self._handles = {}
        self._requires_update = set()
        self._synchronized = False
        self._should_synchronize = True
        self._timeline_hook_handles = []
        self._use_timeline = False
        self._num_steps_per_communication = num_steps_per_communication
        self._reduce_type_str = reduce_type
        # _reduce_method: 0 for allreduce, 1 for neighbor_allreduce, 2 for hierarchical neighbor_allreduce
        if self._reduce_type_str == "allreduce":
            self._reduce_method = 0
        elif self._reduce_type_str == "neighbor.allreduce":
            self._reduce_method = 1
        elif self._reduce_type_str == "hierarchical.neighbor.allreduce":
            self._reduce_method = 2
        else:
            raise ValueError(
                "Unknown reduce type for internal class _DistributedReduceOptimizer"
            )

        self._reduce_delay = {
            v: self._num_steps_per_communication
            for _, v in sorted(named_parameters)
        }
        if os.getenv('BLUEFOG_TIMELINE'):
            self.turn_on_timeline()
        if bf.size() > 1:
            self._register_hooks()
Example #22
    def test_associated_with_p_random_test(self):
        size = bf.size()
        rank = bf.rank()
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        # Currently, the NCCL version does not support the associated p yet.
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1]
        bf.turn_on_win_ops_with_associated_p()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(
                dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)
            for _ in range(10):
                random_weights = np.random.rand(
                    len(bf.out_neighbor_ranks()) + 1)
                random_weights /= random_weights.sum()
                self_weight = random_weights[-1]
                dst_weights = {
                    r: random_weights[i]
                    for i, r in enumerate(bf.out_neighbor_ranks())
                }
                bf.win_put(tensor,
                           self_weight=self_weight,
                           dst_weights=dst_weights,
                           name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p always undergoes the same operations as the tensor,
            # the following assert should hold no matter what order is executed.
            assert abs(associated_p - tensor.data[0]) < EPSILON

        bf.turn_off_win_ops_with_associated_p()
Example #23
    def test_in_out_neighbors_biring(self):
        bf.init()
        rank = bf.rank()
        size = bf.size()
        assert bf.set_topology(RingGraph(size))
        in_neighbors = bf.in_neighbor_ranks()
        out_neighbors = bf.out_neighbor_ranks()

        expected_in_neighbors = list(
            set(map(lambda x: x % size, [rank - 1, rank + 1])))
        expected_out_neighbors = list(
            set(map(lambda x: x % size, [rank - 1, rank + 1])))

        if size <= 1:
            expected_in_neighbors = []
            expected_out_neighbors = []

        assert sorted(in_neighbors) == expected_in_neighbors
        assert sorted(out_neighbors) == expected_out_neighbors
Example #24
    def test_win_put_with_given_destination(self):
        """Test that the window put operation with given destination."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use exponential two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        # We use given destination to form a (right-)ring.
        avg_value = (rank * indegree + 1.23 *
                     ((rank - 1) % size)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_put(tensor,
                       window_name,
                       dst_weights={(rank + 1) % size: 1.23})
            bf.barrier()
            sync_result = bf.win_update(window_name)
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_put given destination produces wrong shape tensor."
            )
            assert (sync_result.data - avg_value).abs().max() < EPSILON, (
                "bf.win_update after win_put given destination produces wrong tensor value "
                + "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))

        time.sleep(0.5)
        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free did not free the window object successfully."
Example #25
    def test_win_free_all(self):
        size = bf.size()
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create did not create the window object successfully."

        is_freed = bf.win_free()
        assert is_freed, "bf.win_free did not free the window object successfully."
Example #26
    def __init__(self, params, lr, L, communication_type):
        '''
        lr: Learning rate
        L: Number of batches
        '''
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        defaults = dict(lr=lr, L=L)
        super(ExactDiff, self).__init__(params, defaults)

        self._communication_type = communication_type
        self.lr = lr
        self.L = L
        self._q = bf.size()
        self._states = {}
        for groups in self.param_groups:
            for p in groups['params']:
                self._states[p] = {
                    'psi': torch.clone(p),
                    'phi': torch.zeros_like(p),
                    'handle': None
                }
Example #27
    def __init__(self, params, model, backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)

        named_parameters, models = _check_named_parameters(self, model)
        self._models = models
        self._parameter_names = {v: k for k, v in sorted(named_parameters)}
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        self._synchronized = False
        self._should_synchronize = True
        self._timeline_hook_handles = []
        self._use_timeline = False
        self._backward_passes_per_step = backward_passes_per_step
        self._allreduce_delay = {
            v: self._backward_passes_per_step
            for _, v in sorted(named_parameters)
        }
        if os.getenv('BLUEFOG_TIMELINE'):
            self.turn_on_timeline()
        if bf.size() > 1:
            self._register_hooks()
Example #28
    def test_win_mutex_given_ranks(self):
        size = bf.size()
        rank = bf.rank()
        if size < 4:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 4 nodes".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        for dtype in dtypes:
            tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_mutex_given_ranks_{}".format(dtype)
            bf.win_create(tensor, window_name)
            if rank == 0:
                with bf.win_mutex(window_name, for_self=True, ranks=[1]):
                    bf.barrier()
                    time.sleep(1.01)
            elif rank == 1:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name, ranks=[0]):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) > 1
            elif rank == 2:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name, ranks=[0]):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) < 0.1
            else:
                bf.barrier()
Example #29
    def test_win_accumulate_with_given_destination(self):
        """Test that the window accumulate operation with given destination."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        avg_value = rank + ((rank - 1) % size) * 1.23 / 2.0

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_accumulate_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_accumulate(tensor,
                              window_name,
                              dst_weights={(rank + 1) % size: 1.23})

            bf.barrier()
            sync_result = bf.win_update(window_name,
                                        self_weight=0.5,
                                        neighbor_weights={
                                            (rank - 1) % size: 0.5
                                        })

            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_accmulate given destination produces wrong shape tensor."
            )
            assert (sync_result.data - avg_value).abs().max() < EPSILON, (
                "bf.win_update after win_accmulate given destination produces wrong tensor value "
                + "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))
Example #30
    def test_win_create_and_sync_and_free(self):
        """Test that the window create and free objects correctly."""
        size = bf.size()
        rank = bf.rank()
        # The OpenMPI implementation does not seem to allow win_create with size 1.
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use exponential two ring topology.
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create did not create the window object successfully."

            sync_result = bf.win_update(window_name)
            assert (list(sync_result.shape) == [DIM_SIZE] *
                    dim), ("bf.win_update produce wrong shape tensor.")
            assert (sync_result.data.min() == rank), (
                "bf.win_update produces wrong tensor value " +
                "{0}!={1} at rank {1}.".format(sync_result.data.min(), rank))
            assert (sync_result.data.max() == rank), (
                "bf.win_update produces wrong tensor value " +
                "{0}!={1} at rank {1}.".format(sync_result.data.max(), rank))

        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free did not free the window object successfully."