Example 1
def cast_and_place(tensor, dtype):
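    # Move the tensor onto the GPU assigned to this rank when a CUDA dtype is
    # requested, then cast it to the target dtype.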
    if dtype.is_cuda:
        if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
            raise EnvironmentError(
                "Cannot run more processes on one machine than the GPU device"
                " count in an NCCL environment")
        return tensor.cuda(bf.rank() % torch.cuda.device_count()).type(dtype)
    return tensor.type(dtype)
Example 2
def pin_model_to_device(device, model):
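    # When device == "GPU", pin this process to one GPU (chosen by its Bluefog
    # local rank) and move the model there; returns whether CUDA is used.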
    isCUDA = device == "GPU"
    if isCUDA:
        # Bluefog: pin GPU to local rank.
        device_id = (bf.local_rank() if bf.nccl_built() else
                     bf.local_rank() % torch.cuda.device_count())
        torch.cuda.set_device(device_id)
        model.cuda()
    return isCUDA
Example 3
    def test_asscoicated_with_p(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 3:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires more than 3 nodes".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        bf.set_topology(topology_util.RingGraph(size))
        bf.turn_on_win_ops_with_associated_p()
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(
                dtype, send_rank)
            bf.win_create(tensor, window_name)
            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
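            # Only the sender accumulates: it keeps self_weight=0.5 locally and
            # pushes 0.5 to each ring neighbor, so its associated p becomes 0.5
            # while each neighbor's becomes 1.5 (untouched ranks stay at 1.0).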
            if rank == send_rank:
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  self_weight=0.5,
                                  dst_weights={
                                      left_neighbor_rank: 0.5,
                                      right_neighbor_rank: 0.5
                                  })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif (rank == left_neighbor_rank) or (rank == right_neighbor_rank):
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".
                    format(rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".
                    format(rank, associated_p))
        bf.turn_off_win_ops_with_associated_p()
Example 4
    def test_asscoicated_with_p_random_test(self):
        size = bf.size()
        rank = bf.rank()
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        # Currently, the NCCL version does not support the associated p yet.
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1]
        bf.turn_on_win_ops_with_associated_p()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(
                dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)
            for _ in range(10):
                random_weights = np.random.rand(
                    len(bf.out_neighbor_ranks()) + 1)
                random_weights /= random_weights.sum()
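                # Normalize so that self_weight plus all dst_weights sum to one.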
                self_weight = random_weights[-1]
                dst_weights = {
                    r: random_weights[i]
                    for i, r in enumerate(bf.out_neighbor_ranks())
                }
                bf.win_put(tensor,
                           self_weight=self_weight,
                           dst_weights=dst_weights,
                           name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p is updated in the same way as the tensor,
            # the following assert should hold no matter what order is executed.
            assert abs(associated_p - tensor.data[0]) < EPSILON

        bf.turn_off_win_ops_with_associated_p()
Example 5
                for r in recv_neighbors
            }
            self_weight = 1 / (len(recv_neighbors) + 1)

        x = bf.neighbor_allreduce(x,
                                  name='x',
                                  self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
                                  send_neighbors=send_neighbors,
                                  enable_topo_check=False)
        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))
else:
    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    if not bf.nccl_built():  # NCCL does not support associated p yet.
        bf.turn_on_win_ops_with_associated_p()
        bf.win_create(x, name="x", zero_init=True)
        for i in range(args.max_iters):
            if args.enable_dynamic_topology:
                num_out_neighbors = len(bf.out_neighbor_ranks())
                sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
                dst_weights = {sent_neighbor: 0.5}
                self_weight = 0.5
            else:
                dst_weights = {
                    rank: 1.0 / (outdegree + 1)
                    for rank in bf.out_neighbor_ranks()
                }
                self_weight = 1 / (1 + outdegree)
Example 6
parser.add_argument('--batch_size', type=int, default=100,
        help='batch size (default: 100).')
parser.add_argument('--seed', type=int, default=3,
        help='random seed (default: 3).')
parser.add_argument('--save_name', type=str, required=True,
        help='file postfix for the saved log.')

args = parser.parse_args()
cudnn.benchmark = True
cudnn.enabled = True
torch.manual_seed(args.seed)
np.random.seed(args.seed)

bf.init()

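# Bluefog: pin each process to one GPU, selected by its local rank.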
device_id = bf.local_rank() if bf.nccl_built() else bf.local_rank() % torch.cuda.device_count()
torch.cuda.set_device(device_id)
torch.cuda.manual_seed(args.seed)

kwargs = {"num_workers": 4, "pin_memory": True}

# load the data
if args.dataset == "MNIST":
    train_set, test_set = MNIST_dataset_flat_dist(bf.rank())
    NN_model = MNIST_two_layers
elif args.dataset == "MNIST_Conv":
    train_set, test_set = MNIST_dataset_dist(bf.rank())
    NN_model = LeNet
elif args.dataset == "CIFAR10":
    train_set, test_set = CIFAR10_dataset_dist(bf.rank()) 
    NN_model = vgg11