Example #1
    def test_allreduce_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

        def allreduce(x, op):
            opts = c10d.AllreduceOptions()
            opts.reduceOp = op
            work = pg.allreduce([x], opts)
            work.wait()

        # Sum
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.SUM)
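        # Every rank contributes rank + 1, so the SUM result is
        # world_size * (world_size + 1) / 2 on every rank.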
        self.assertEqual(torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x)

        # Product
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.PRODUCT)
        self.assertEqual(torch.Tensor([float(math.factorial(self.world_size))]), x)

        # Min
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.MIN)
        self.assertEqual(torch.Tensor([1.0]), x)

        # Max
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.MAX)
        self.assertEqual(torch.Tensor([self.world_size]), x)

        # Test overloaded convenience function (defaults to using sum)
        x = torch.Tensor([self.rank + 1.0])
        work = pg.allreduce(x)
        work.wait()
        self.assertEqual(torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x)
Example #2
 def test_duplicated_names(self):
     store = dist.FileStore(self.file.name, self.world_size)
     dist.init_process_group(backend="gloo", rank=self.rank,
                             world_size=self.world_size, store=store)
     with self.assertRaisesRegex(RuntimeError, "is not unique"):
         dist.init_model_parallel("duplicated_name")
     dist.join_rpc()
Example #3
 def wrapper(self):
     store = dist.FileStore(self.file.name, self.world_size)
     dist.init_process_group(backend='gloo', rank=self.rank,
                             world_size=self.world_size, store=store)
     dist.init_rpc('worker{}'.format(self.rank))
     func(self)
     dist.join_rpc()
Example #4
        def test_all_gather_base(self):
            store = c10d.FileStore(self.file_name, self.world_size)
            c10d.init_process_group(store=store,
                                    rank=self.rank,
                                    world_size=self.world_size,
                                    backend='nccl')

            device = torch.device(f"cuda:{self.rank}")
            x = torch.ones(5, 5, device=device) + self.rank
            x.requires_grad = True

            output = torch.empty(5 * self.world_size, 5, device=device)
            output = torch.distributed.nn.functional._all_gather_base(
                output, x)
            self.assertEqual(output.size(), torch.Size(
                (5 * self.world_size, 5)))

            for idx in range(self.world_size):
                self.assertEqual(output[5 * idx:5 * (idx + 1)],
                                 torch.ones(5, 5, device=device) + idx)

            y = torch.sum(output.view(self.world_size, 5, 5), axis=0)
            z = y.sin().sum()
            z.backward()

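            # The hard-coded expectation below assumes world_size == 2: each element
            # of y is then 1 + 2 = 3, and the gradient contributions from both ranks
            # are summed on the way back through the all_gather, giving 2 * cos(3).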
            x_s = 2 * (3 * torch.ones(5, 5, device=device)).cos()
            self.assertEqual(x.grad, x_s)
Example #5
    def test_send_recv_all_to_all(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

        # Preallocate tensors for input/output
        inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)]
        outputs = [torch.Tensor([-1]) for _ in range(self.world_size)]

        # Issue sends
        send_work = []
        for i in range(self.world_size):
            if i == self.rank:
                continue
            send_work.append(pg.send([inputs[i]], i, 0))

        # Issue recvs
        recv_work = []
        for i in range(self.world_size):
            if i == self.rank:
                continue
            recv_work.append(pg.recv([outputs[i]], i, 0))

        # Wait for sends to complete
        for work in send_work:
            work.wait()

        # Wait for recvs to complete
        for work in recv_work:
            work.wait()

        # Test that every output other than our own contains the respective rank
        for i in range(self.world_size):
            if i == self.rank:
                continue
            self.assertEqual(torch.Tensor([i]), outputs[i])
Example #6
 def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
     store = c10d.FileStore(self.file_name, self.world_size)
     c10d.init_process_group(
         backend="nccl",
         rank=self.rank,
         world_size=self.world_size,
         store=store,
         timeout=timedelta(seconds=timeout),
     )
     if with_new_group:
         pg = c10d.new_group(backend="nccl",
                             timeout=timedelta(seconds=timeout))
     else:
         _pg = c10d.ProcessGroupNCCL(store,
                                     self.rank,
                                     self.world_size,
                                     timeout=timedelta(seconds=timeout))
         pg = c10d._create_process_group_wrapper(
             _pg,
             "unused",
             store,
             self.rank,
             self.world_size,
             timeout=timeout,
         )
     return pg
Example #7
    def test_invalid_names(self):
        store = dist.FileStore(self.file.name, self.world_size)
        dist.init_process_group(backend="gloo",
                                rank=self.rank,
                                world_size=self.world_size,
                                store=store)

        with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
            dist.init_model_parallel(self_name="abc*")

        with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
            dist.init_model_parallel(self_name=" ")

        with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
            dist.init_model_parallel(self_name="")

        # If the number in the message does not match, it is likely that the
        # value of MAX_NAME_LEN in RPC WorkerId has changed.
        with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
            dist.init_model_parallel(self_name="".join(
                ["a" for _ in range(500)]),
                                     backend=BACKEND,
                                     self_rank=self.rank,
                                     init_method=RPC_INIT_URL)
        dist.join_rpc()
Example #8
    def test_queue_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]

        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        # The first return value should be the allreduce work item.
        self.assertTrue(isinstance(work, c10d.Work))
        # The second return value will be the finished allreduced gradients.
        self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

        # Wait for the allreduce to finish.
        work.wait()

        # The expected result of the allreduce is the average of (rank + 1)
        # across ranks, i.e. (world_size + 1) / 2.
        self.assertEqual(local_grad_sum,
                         torch.ones(10) * (self.world_size + 1) / 2.0)
Example #9
    def test_scatter(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        # This is required because these functions call directly into
        # torch.distributed and need the default process group to be initialized.
        c10d.init_process_group(store=store,
                                rank=self.rank,
                                world_size=self.world_size,
                                backend='gloo')
        device = torch.device(f"cuda:{self.rank}")
        x0 = torch.ones(5, 5, device=device)
        x1 = torch.ones(5, 5, device=device) + 1
        x0.requires_grad = True
        x1.requires_grad = True

        y = torch.distributed.nn.scatter([x0, x1], 1)
        if self.rank == 1:
            self.assertEqual(y, 1 + torch.ones(5, 5, device=device))
        elif self.rank == 0:
            self.assertEqual(y, torch.ones(5, 5, device=device))
        z = y.sin().sum()
        z.backward()

        # Test gradient
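        # The backward of scatter gathers the output gradients onto the source
        # rank (rank 1 here), so only rank 1 gets non-zero gradients:
        # cos(1) for x0 (received by rank 0) and cos(2) for x1 (received by rank 1).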
        if self.rank == 1:
            x0_s = torch.ones(5, 5, device=device).cos()
            x1_s = (2 * torch.ones(5, 5, device=device)).cos()
            self.assertEqual(x0.grad, x0_s)
            self.assertEqual(x1.grad, x1_s)
        if self.rank == 0:
            self.assertEqual(x0.grad, torch.zeros(5, 5, device=device))
Example #10
        def test_reduce_scatter_non_contiguous(self):
            store = c10d.FileStore(self.file_name, self.world_size)
            # This is required because these functions call directly into
            # torch.distributed and need the default process group to be initialized.
            c10d.init_process_group(store=store,
                                    rank=self.rank,
                                    world_size=self.world_size,
                                    backend='nccl')
            device = torch.device(f"cuda:{self.rank}")

            class NonContiguousGrad(torch.autograd.Function):
                @staticmethod
                def forward(ctx, input):
                    return input

                @staticmethod
                def backward(ctx, grad_output):
                    # Make grad non-contiguous
                    return grad_output.clone().transpose(0, 1)

            x0 = torch.rand(5, 5, device=device, requires_grad=True)
            x1 = torch.rand(5, 5, device=device, requires_grad=True)
            y = torch.empty(5, 5, device=device)

            y = torch.distributed.nn.reduce_scatter(y, [x0, x1])
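            # The backward pass must tolerate the non-contiguous gradient produced
            # by NonContiguousGrad without raising.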
            NonContiguousGrad.apply(y).sum().backward()
Example #11
    def test_fp16(self):
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        gpus = gpus_for_rank(self.world_size)[self.rank]
        model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
        nn.init.constant_(model.weight, 1)
        ddp_model = DistributedDataParallel(
            model,
            device_ids=[gpus[0]],
            process_group=process_group,
            bucket_cap_mb=1,
        )

        # Input 2**15, so that the gradients will overflow with a
        # world_size of 2, unless we normalize the gradient by the
        # world_size before the reduction
        input = torch.Tensor([[2**15]]).cuda(gpus[0]).half()

        # Step model
        ddp_model.train()
        output = ddp_model(input)
        loss = output.sum()
        loss.backward()

        self.assertFalse(
            any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
Example #12
    def test_dist_broadcast_coalesced(self):
        store = c10d.FileStore(self.file.name)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)

        device = torch.device('cuda')

        target = torch.arange(10, dtype=torch.float64, device=device).chunk(5)

        if self.is_master:
            # All processes should have these tensors in the end.
            tensors = target
        else:
            # Non-master processes start with empty tensors and should be
            # filled with the tensors from the master.
            tensors = torch.zeros(10, device=device).chunk(5)

        c10d._dist_broadcast_coalesced(process_group, tensors, buffer_size=10)

        if not self.is_master:
            self.assertEqual(tensors, target)
Example #13
    def _test_base(self, net, inp, check_allclose=True):
        store = c10d.FileStore(self.file.name, self.world_size)
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size)

        ddp = nn.parallel.DistributedDataParallel(copy.deepcopy(net),
                                                  process_group=process_group)

        net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
        ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)

        for i, j in zip(ddp.parameters(), net.parameters()):
            self.assertTrue(i.allclose(j))

        for _ in range(10):
            net_out = net(*inp)
            ddp_out = ddp(*inp)

            net_out.sum().backward()
            ddp_out.sum().backward()

            net_opt.step()
            ddp_opt.step()

        if check_allclose:
            for i, j in zip(ddp.parameters(), net.parameters()):
                self.assertTrue(i.allclose(j))
Example #14
 def test_reduce_scatter(self):
     store = c10d.FileStore(self.file_name, self.world_size)
     # This is required because these functions call directly into
     # torch.distributed and need the default process group to be initialized.
     c10d.init_process_group(store=store,
                             rank=self.rank,
                             world_size=self.world_size,
                             backend='nccl')
     device = torch.device(f"cuda:{self.rank}")
     x0 = torch.ones(5, 5, device=device) + self.rank
     x1 = torch.ones(5, 5, device=device) + self.rank + 1
     x0.requires_grad = True
     x1.requires_grad = True
     y = torch.empty_like(x0)
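     # Rank k receives the elementwise sum over ranks r of (1 + k + r), i.e.
     # world_size * (world_size + 1) / 2 + world_size * k.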
     expected = (1 + self.world_size
                 ) * self.world_size / 2 + self.world_size * self.rank
     y = torch.distributed.nn.reduce_scatter(y, [x0, x1])
     self.assertEqual(y, torch.ones(5, 5, device=device) * expected)
     z = y.sin().sum()
     z.backward()
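     # The backward of reduce_scatter all-gathers the output gradient, so
     # x_j.grad is the cosine of the value that was reduced onto rank j.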
     expected_0 = (1 + self.world_size) * self.world_size / 2
     expected_1 = expected_0 + self.world_size
     x_s_0 = (expected_0 * torch.ones(5, 5, device=device)).cos()
     x_s_1 = (expected_1 * torch.ones(5, 5, device=device)).cos()
     self.assertEqual(x0.grad, x_s_0)
     self.assertEqual(x1.grad, x_s_1)
Example #15
    def test_gather(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        # This is required because these functions call directly into
        # torch.distributed and need the default process group to be initialized.
        c10d.init_process_group(store=store,
                                rank=self.rank,
                                world_size=self.world_size,
                                backend='gloo')
        device = torch.device(f"cuda:{self.rank}")
        x = torch.ones(5, 5, device=device) + self.rank
        x.requires_grad = True
        tensors = torch.distributed.nn.gather(x, 1)
        if self.rank == 1:
            for i, t in enumerate(tensors):
                self.assertEqual(t, torch.ones(5, 5, device=device) + i)
        elif self.rank == 0:
            for i, t in enumerate(tensors):
                zeros = torch.zeros(5, 5, device=device)
                self.assertEqual(t, zeros)
        y = torch.sum(torch.stack(tensors), axis=0)
        z = y.sin().sum()
        z.backward()

        # Test gradient
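        # The expectation below assumes world_size == 2: y is 3 * ones on the
        # destination rank, and its gradient is scattered back so that every
        # rank ends up with cos(3).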
        x_s = 3 * torch.ones(5, 5, device=device)
        self.assertEqual(x.grad, x_s.cos())
Example #16
    def test_broadcast_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size,
                                   self.opts())

        def broadcast(xs, rootRank, rootTensor):
            opts = c10d.BroadcastOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            work = pg.broadcast(xs, opts)
            work.wait()

        # Every rank is root once, every tensor index is root once
        for i in range(self.world_size):
            for j in range(2):
                xs = [
                    torch.Tensor([self.rank * self.world_size + 0.0]),
                    torch.Tensor([self.rank * self.world_size + 1.0]),
                ]

                broadcast(xs, i, j)
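                # After the broadcast, both tensors on every rank should hold
                # the root's value, i.e. rootRank * world_size + rootTensor.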
                self.assertEqual(torch.Tensor([i * self.world_size + j]),
                                 xs[0])
                self.assertEqual(torch.Tensor([i * self.world_size + j]),
                                 xs[1])

        # Test overloaded convenience function
        x = torch.Tensor([self.rank + 1.0])
        work = pg.broadcast(x, root=0)
        work.wait()
        self.assertEqual(torch.Tensor([1.0]), x)
Example #17
    def test_sync_params_with_buffers(self):
        store = c10d.FileStore(self.file.name)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]

        # sync_params should do a dist_broadcast for buffers, so we only populate the master buffers and
        # then check that other processes' tensors end up matching.

        if self.is_master:
            buffer_data = [target]
            buffer_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]
        else:
            buffer_data = [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices]

        c10d._sync_params(
            process_group,
            parameter_data=parameter_data,
            buffer_data=buffer_data,
            devices=devices,
            broadcast_bucket_size=10,
            broadcast_buffers=True)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])

        for device_data in buffer_data:
            for i, buffer in enumerate(device_data):
                self.assertEqual(buffer, target[i])
Example #18
    def test_is_last_hook(self):

        store = dist.FileStore(self.file_name, self.world_size)
        process_group = dist.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        def hook(flags, bucket):
            flags.append(bucket.is_last())
            fut = torch.futures.Future()
            fut.set_result(bucket.buffer())
            return fut

        flags = []
        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = nn.Sequential(
            nn.Linear(2, 4000, bias=False),
            *[nn.Linear(4000, 4000, bias=False) for _ in range(10)])
        gpu_model = DistributedDataParallel(
            model.to(device_id),
            device_ids=[device_id],
            process_group=process_group,
        )
        gpu_model.register_comm_hook(state=flags, hook=hook)
        input = torch.randn(10, 2)
        gpu_model(input).sum().backward()
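        # Only the final bucket of the backward pass should report is_last() == True.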
        self.assertTrue(flags[-1])
        self.assertFalse(any(flags[:-1]))
Example #19
 def dist_init(self, rank, world_size=-1):
     if world_size < 1:
         world_size = self.world_size
     store = dist.FileStore(self.file_name, world_size)
     return dist.init_process_group(backend=BACKEND,
                                    store=store,
                                    rank=rank,
                                    world_size=world_size)
Example #20
 def test_all_gather_bfp16(self):
     store = dist.FileStore(self.file_name, self.world_size)
     dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
     device = torch.device(f"cuda:{self.rank}")
     group = list(range(0, self.world_size))
     group_id = dist.group.WORLD
     self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16)
Example #21
    def _test_warn_not_in_group(self, backend):
        store = dist.FileStore(self.file_name, self.world_size)
        dist.init_process_group(
            backend,
            world_size=self.world_size,
            rank=self.rank,
            store=store,
        )
        in_group_ranks = list(
            filter(lambda x: x % 2 == 0, range(self.world_size)))
        group = dist.new_group(in_group_ranks)

        x = torch.zeros(2, 2).cuda(self.rank)
        xs = [
            torch.zeros(2, 2).cuda(self.rank)
            for _ in range(len(in_group_ranks))
        ]
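        # Ranks outside the even-rank subgroup should get a one-time warning and
        # the collective should return without communicating.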
        if self.rank not in in_group_ranks:
            msg = ".*{}.*does not belong to.*"
            with self.assertWarnsOnceRegex(UserWarning,
                                           msg.format("all_gather")):
                dist.all_gather(xs, x, group=group)
            with self.assertWarnsOnceRegex(UserWarning,
                                           msg.format("all_reduce")):
                dist.all_reduce(x, group=group)
            with self.assertWarnsOnceRegex(UserWarning, msg.format("barrier")):
                dist.barrier(group=group)
            with self.assertWarnsOnceRegex(UserWarning,
                                           msg.format("broadcast")):
                dist.broadcast(x, src=0, group=group)
        else:
            dist.all_gather(xs, x, group=group)
            dist.all_reduce(x, group=group)
            dist.barrier(group=group)
            dist.broadcast(x, src=0, group=group)
Example #22
 def _test_all_to_all_single(self, backend):
     store = c10d.FileStore(self.file_name, self.world_size)
     # This is required because these functions call directly into
     # torch.distributed and need the default process group to be initialized.
     c10d.init_process_group(store=store,
                             rank=self.rank,
                             world_size=self.world_size,
                             backend=backend)
     device = torch.device(f"cuda:{self.rank}")
     row = self.world_size * (self.rank + 1) * (self.world_size + 1) / 2
     x = torch.ones(int(row), 5, device=device) * (self.rank + 1)
     x.requires_grad = True
     y = torch.empty_like(x)
     split_sizes = [(i + 1) * (self.rank + 1)
                    for i in range(self.world_size)]
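     # Rank r sends (i + 1) * (r + 1) rows to rank i and receives the same
     # number back, so both the input and output splits sum to `row`.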
     y = torch.distributed.nn.all_to_all_single(
         y,
         x,
         output_split_sizes=split_sizes,
         input_split_sizes=split_sizes)
     expected = []
     for idx, tensor in enumerate(torch.split(x, split_sizes)):
         expected.append(torch.full_like(tensor, (idx + 1)))
     expected = torch.cat(expected)
     self.assertEqual(y, expected)
     z = y.sin().sum()
     z.backward()
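     # The backward routes each gradient chunk back to its sender, so x.grad
     # is cos(rank + 1) everywhere.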
     x_s = ((self.rank + 1) * torch.ones(int(row), 5, device=device)).cos()
     self.assertEqual(x.grad, x_s)
Example #23
    def test_sync_params_no_buffers(self):
        store = c10d.FileStore(self.file.name)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)

        # Use all available devices on every process here (data is small, so should be fine).
        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64,
                              device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices[1:]
        ]
        buffer_data = [[]] * len(parameter_data)

        c10d._sync_params(process_group,
                          parameter_data=parameter_data,
                          buffer_data=buffer_data,
                          devices=devices,
                          broadcast_bucket_size=10,
                          broadcast_buffers=False)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])
Example #24
 def wrapper(self):
     store = dist.FileStore(self.file.name, self.world_size)
     dist.init_process_group(backend='gloo', rank=self.rank,
                             world_size=self.world_size, store=store)
     dist.init_model_parallel('worker%d' % self.rank)
     func(self)
     dist.join_rpc()
Example #25
 def dist_init(self, rank, world_size=-1, backend=BACKEND):
     if (world_size < 1):
         world_size = self.world_size
     store = dist.FileStore(self.file_name, world_size)
     return dist.init_process_group(backend=backend,
                                    store=store,
                                    rank=rank,
                                    world_size=world_size)
Example #26
 def test_gloo_backend(self):
     store = c10d.FileStore(self.file.name)
     options = c10d.ProcessGroupGloo.Options()
     options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
     process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)
     gpus = gpus_for_rank(self.world_size)[self.rank]
     self._test_ddp_with_process_group(process_group, gpus)
     self._test_ddp_with_process_group(process_group, list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
Example #27
    def train(self, model, data):
        torch.manual_seed(0)
        model = model.cuda(self.rank)
        for i in range(len(data)):
            data[i][0] = data[i][0].cuda(self.rank)
            data[i][1] = data[i][1].cuda(self.rank)
        torch.cuda.synchronize(self.rank)

        process_group_size = self.trainer_count

        store = c10d.FileStore("/tmp/tmpn_k_8so02", process_group_size)

        process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                              process_group_size)

        ddp_model = DDP(model,
                        device_ids=[self.rank],
                        process_group=process_group)

        hook_state = self.HookState(self, process_group)

        ddp_model.register_comm_hook(hook_state, DdpNcclTrainer.hook)

        criterion = nn.CrossEntropyLoss().cuda(self.rank)

        optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)

        def epoch_key(epoch, index):
            return f"{epoch},{index}"

        for epoch in range(self.epochs):
            for index, batch in enumerate(data):
                hook_state.next_batch_state()
                input, target = batch[0], batch[1]

                self.record_batch_start(epoch_key(epoch, index))

                optimizer.zero_grad()

                self.record_forward_start(epoch_key(epoch, index))

                out = ddp_model(input)

                self.record_forward_end(epoch_key(epoch, index))

                loss = criterion(out, target)

                self.record_backward_start(epoch_key(epoch, index))

                loss.backward()

                self.record_backward_end(epoch_key(epoch, index))

                optimizer.step()

                self.record_batch_end(epoch_key(epoch, index))

        torch.cuda.synchronize(self.rank)
Example #28
 def test_nccl_backend(self):
     store = c10d.FileStore(self.file.name)
     process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                           self.world_size)
     gpus = gpus_for_rank(self.world_size)[self.rank]
     self._test_ddp_with_process_group(process_group, gpus)
     self._test_ddp_with_process_group(
         process_group,
         list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
Example #29
 def test_reinit(self):
     store = dist.FileStore(self.file.name, self.world_size)
     dist.init_process_group(backend="gloo", rank=self.rank,
                             world_size=self.world_size, store=store)
     with self.assertRaisesRegex(RuntimeError, "is not unique"):
         dist.init_model_parallel(self_name="duplicate_name",
                                  backend=BACKEND,
                                  self_rank=self.rank,
                                  init_method=RPC_INIT_URL)
     dist.join_rpc()
Example #30
 def wrapper(self):
     store = dist.FileStore(self.file.name, self.world_size)
     dist.init_process_group(backend='gloo', rank=self.rank,
                             world_size=self.world_size, store=store)
     dist.init_model_parallel(self_name='worker%d' % self.rank,
                              backend=BACKEND,
                              self_rank=self.rank,
                              init_method=RPC_INIT_URL)
     func(self)
     dist.join_rpc()