def cast_and_place(tensor, dtype):
    """Convert ``tensor`` to ``dtype``, moving it onto a GPU for CUDA dtypes.

    Args:
        tensor: The tensor to convert.
        dtype: A tensor type exposing ``is_cuda`` and accepted by
            ``Tensor.type`` (e.g. ``torch.FloatTensor`` or
            ``torch.cuda.FloatTensor``).

    Returns:
        ``tensor.type(dtype)``. For CUDA dtypes the tensor is first placed on
        device ``bf.local_rank() % torch.cuda.device_count()``.

    Raises:
        EnvironmentError: If Bluefog was built with NCCL and this machine runs
            more processes than it has CUDA devices.
    """
    if not dtype.is_cuda:
        return tensor.type(dtype)

    if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
        raise EnvironmentError(
            "Cannot run number of processes in one machine more than GPU device count"
            " in NCCL environment")

    # Select the device from the *local* rank, not the global one: global
    # ranks on one machine are not guaranteed to be contiguous, so
    # ``rank % device_count`` can map two local processes to the same GPU,
    # which is exactly what the NCCL check above is meant to rule out.
    device_index = bf.local_rank() % torch.cuda.device_count()
    return tensor.cuda(device_index).type(dtype)
def pin_model_to_device(device, model):
    """Pin this process to a GPU and call ``model.cuda()`` when ``device`` is "GPU".

    Args:
        device: Device selector; only the exact value ``"GPU"`` enables CUDA.
        model: Object exposing ``cuda()`` (presumably a ``torch.nn.Module``;
            only touched on the GPU path).

    Returns:
        The result of ``device == "GPU"``.
    """
    use_gpu = device == "GPU"
    if not use_gpu:
        return use_gpu

    # Bluefog: pin GPU to local rank. Under NCCL the local rank is used as the
    # device index directly; otherwise it is wrapped around the device count.
    if bf.nccl_built():
        gpu_index = bf.local_rank()
    else:
        gpu_index = bf.local_rank() % torch.cuda.device_count()

    torch.cuda.set_device(gpu_index)
    model.cuda()
    return use_gpu
def test_asscoicated_with_p(self):
    """Check the associated p after a single ``win_accumulate`` on a ring.

    For every (dtype, send_rank) pair a window is created on a ring topology
    and only ``send_rank`` accumulates, with self weight 0.5 and weight 0.5 to
    each of its two ring neighbors. After a barrier and
    ``win_update_then_collect`` the associated p is expected to be:

    * 0.5 on the sender,
    * 1.5 on the sender's left and right neighbors,
    * 1.0 on every other rank.

    The test is skipped with a warning when ``bf.size() <= 3``.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 3:
        fname = inspect.currentframe().f_code.co_name
        warnings.warn(
            "Skip {} because it only supports test over at least 3 nodes".format(fname))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

    bf.set_topology(topology_util.RingGraph(size))
    bf.turn_on_win_ops_with_associated_p()
    # try/finally so a failing assertion cannot leave the associated-p mode
    # switched on for whatever runs next in this process.
    try:
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(dtype, send_rank)
            bf.win_create(tensor, window_name)

            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
            if rank == send_rank:
                bf.win_accumulate(
                    tensor,
                    name=window_name,
                    self_weight=0.5,
                    dst_weights={
                        left_neighbor_rank: 0.5,
                        right_neighbor_rank: 0.5,
                    })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)

            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif rank in (left_neighbor_rank, right_neighbor_rank):
                # abs() is required: without it any value *below* 1.5
                # (e.g. 1.0, i.e. nothing received) would pass the check.
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".format(
                        rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".format(
                        rank, associated_p))
    finally:
        bf.turn_off_win_ops_with_associated_p()
def test_asscoicated_with_p_random_test(self):
    """Check that the associated p tracks the tensor under random window ops.

    For each dtype a zero-initialised window is created over an all-ones
    tensor. Ten rounds of ``win_put`` / ``win_update`` / ``win_accumulate`` /
    ``win_update_then_collect`` are then issued with random weights that sum
    to one over this rank and its out-neighbors. After a barrier and a final
    collect, the associated p must equal the first tensor element within
    ``EPSILON``.
    """
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    # Currently the NCCL version does not support associated p yet.
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1]

    bf.turn_on_win_ops_with_associated_p()
    # try/finally so a failing assertion or window op cannot leave the
    # associated-p mode switched on for whatever runs next in this process.
    try:
        # Nothing in this test changes the topology, so query it once.
        out_neighbors = bf.out_neighbor_ranks()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)

            for _ in range(10):
                # One weight per out-neighbor plus one (the last) for self,
                # normalised so that they sum to one.
                random_weights = np.random.rand(len(out_neighbors) + 1)
                random_weights /= random_weights.sum()
                self_weight = random_weights[-1]
                dst_weights = {
                    r: random_weights[i] for i, r in enumerate(out_neighbors)
                }
                bf.win_put(tensor, self_weight=self_weight,
                           dst_weights=dst_weights, name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor, name=window_name, require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)

            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p should always evolve exactly like the
            # tensor, this must hold no matter in which order the operations
            # were executed.
            assert abs(associated_p - tensor.data[0]) < EPSILON
    finally:
        bf.turn_off_win_ops_with_associated_p()
for r in recv_neighbors } self_weight = 1 / (len(recv_neighbors) + 1) x = bf.neighbor_allreduce(x, name='x', self_weight=self_weight, neighbor_weights=neighbor_weights, send_neighbors=send_neighbors, enable_topo_check=False) mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)) else: outdegree = len(bf.out_neighbor_ranks()) indegree = len(bf.in_neighbor_ranks()) if not bf.nccl_built(): # NCCL do not support associated P yet. bf.turn_on_win_ops_with_associated_p() bf.win_create(x, name="x", zero_init=True) for i in range(args.max_iters): if args.enable_dynamic_topology: num_out_neighbors = len(bf.out_neighbor_ranks()) sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors] dst_weights = {sent_neighbor: 0.5} self_weight = 0.5 else: dst_weights = { rank: 1.0 / (outdegree + 1) for rank in bf.out_neighbor_ranks() } self_weight = 1 / (1 + outdegree)
# Remaining command-line options. `parser` is created earlier in the file
# (outside this view); `--save_name` is the only mandatory one registered here.
parser.add_argument('--batch_size', type=int, default=100,
                    help="batch size (default: 100).")
parser.add_argument('--seed', type=int, default=3,
                    help='set seed (default: 3).')
parser.add_argument('--save_name', type=str, required=True,
                    help='The file_postfix to save log')
args = parser.parse_args()

# NOTE(review): `cudnn` is presumably torch.backends.cudnn -- confirm against
# the import block, which is not visible here.
cudnn.benchmark = True
cudnn.enabled = True

# Seed the torch and NumPy generators from the same --seed value.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Bluefog has to be initialised before the rank queries below.
bf.init()
# Pin this process to one GPU: the local rank itself when built with NCCL,
# otherwise the local rank wrapped around the number of CUDA devices.
device_id = bf.local_rank() if bf.nccl_built() else bf.local_rank() % torch.cuda.device_count()
torch.cuda.set_device(device_id)
torch.cuda.manual_seed(args.seed)

# NOTE(review): presumably forwarded to the data loaders further down -- the
# consumer of `kwargs` is not visible here.
kwargs = {"num_workers": 4, "pin_memory": True}

# load the data
# Each branch calls its dataset helper with this process's global rank and
# records the model constructor; the model is not instantiated here.
# `args.dataset` comes from an option registered outside this view.
if args.dataset == "MNIST":
    train_set, test_set = MNIST_dataset_flat_dist(bf.rank())
    NN_model = MNIST_two_layers
elif args.dataset == "MNIST_Conv":
    train_set, test_set = MNIST_dataset_dist(bf.rank())
    NN_model = LeNet
elif args.dataset == "CIFAR10":
    train_set, test_set = CIFAR10_dataset_dist(bf.rank())
    NN_model = vgg11