    def test_ddp_dist_autograd_sparse_grads(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        model = nn.EmbeddingBag(10, 3, sparse=True)
        ddp_model = DistributedDataParallel(model)

        # Different inputs for each trainer (seeded per rank above).
        input = torch.LongTensor(10).random_(0, 10)
        offsets = torch.LongTensor([0, 4])

        # Run local.
        loss = ddp_model(input, offsets).sum()
        loss.backward()

        with dist_autograd.context() as context_id:
            loss = ddp_model(input, offsets).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            self.assertEqual(1, len(grads_dict))
            self.assertEqual(model.weight.grad, grads_dict[model.weight])
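
All of the snippets in this collection assume a per-rank test fixture: a file-based rendezvous via INIT_METHOD_TEMPLATE and self.file_name, plus one process per rank. Below is a minimal sketch of that assumed harness; the template value, the TestFixture name, and the worker function are illustrative assumptions (the real fixtures, which also set up the RPC agent that dist_autograd needs, live in torch.testing._internal).

import tempfile

import torch.multiprocessing as mp

# Assumed rendezvous template: ranks meet through a shared temporary file.
INIT_METHOD_TEMPLATE = "file://{file_name}"

def _worker(rank, world_size, file_name):
    # Hypothetical per-rank entry point: build the fixture and run one test.
    fixture = TestFixture(rank=rank, world_size=world_size, file_name=file_name)
    fixture.test_ddp_dist_autograd_sparse_grads()

if __name__ == "__main__":
    world_size = 2
    file_name = tempfile.NamedTemporaryFile(delete=False).name
    mp.spawn(_worker, args=(world_size, file_name), nprocs=world_size)
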
Example #2
    def _run_basic_test(self, backend, checkpoint):
        dist.init_process_group(
            backend="nccl",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
        # spans GPUs 2 and 3, and the two replicas are synchronized via DDP.
        fc1 = nn.Linear(16, 8).cuda(2 * self.rank)
        fc2 = nn.Linear(8, 4).cuda(2 * self.rank + 1)
        model = nn.Sequential(
            fc1,
            fc2
        )
        model = Pipe(model, chunks=2, checkpoint=checkpoint)
        model = DistributedDataParallel(model)
        out = model(torch.rand(16, 16).cuda(2 * self.rank)).local_value()
        out.sum().backward()

        # Check grads
        output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
        dist.all_gather(output, fc1.weight.grad)
        self.assertEqual(output[0], output[1])

        output = [torch.empty_like(fc2.weight.grad), torch.empty_like(fc2.weight.grad)]
        dist.all_gather(output, fc2.weight.grad)
        self.assertEqual(output[0], output[1])
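
_run_basic_test above leaves backend and checkpoint to the caller. A hypothetical driver might cover the three checkpoint modes that torch.distributed.pipeline.sync.Pipe accepts ("never", "except_last", "always"); the test-method names below are assumptions.

    def test_pipe_ddp_ckpt_never(self):
        self._run_basic_test("nccl", "never")

    def test_pipe_ddp_ckpt_except_last(self):
        self._run_basic_test("nccl", "except_last")

    def test_pipe_ddp_ckpt_always(self):
        self._run_basic_test("nccl", "always")
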
Example #3
    def _run_test_ddp_comparision(self, simulate_uneven_inputs=False):
        gLogger.info(f"Running trainer rank: {self.rank}")
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )
        net = nn.Linear(2, 3)
        ddp_net = DistributedDataParallel(net)

        # Odd ranks join early if simulate_uneven_inputs.
        num_inputs = 1
        if simulate_uneven_inputs:
            if self.rank % 2 == 0:
                num_inputs += 2
        inputs_list = [torch.rand((3, 2)) for _ in range(num_inputs)]

        if simulate_uneven_inputs:
            gLogger.info(
                f"Rank {self.rank} training with {len(inputs_list)} inputs.")

        # Use distributed autograd. The gradients will be in RPC context map.
        grads_dict = {}
        with ddp_net.join(simulate_uneven_inputs):
            for i, inputs in enumerate(inputs_list):
                with dist_autograd.context() as context_id:
                    loss = ddp_net(inputs).norm()
                    dist_autograd.backward(context_id, [loss])
                    grads_dict = dist_autograd.get_gradients(context_id)
                gLogger.info(
                    f"Trainer #{self.rank} got grad dict: {grads_dict}")

                # Use local autograd. The gradients will be in each variable's '.grad'.
                ddp_net.zero_grad()
                loss = ddp_net(inputs).norm()
                loss.backward()

                # The gradients should be the same
                for param in net.parameters():
                    self.assertTrue(
                        param in grads_dict,
                        msg=
                        f"Param {param} is not in dist_auto grad dict {grads_dict} for iteration {i}",
                    )
                    self.assertEqual(
                        grads_dict[param],
                        param.grad,
                        msg=
                        f"The grads for param {param} are different under local "
                        f"and dist autograd: {param.grad} \n---\n {grads_dict[param]} for iteration {i}",
                    )
        dist.destroy_process_group()
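
The join() context manager used above is what keeps the collectives from hanging when ranks process different numbers of inputs: ranks that run out of inputs early shadow the remaining allreduces of the others. A minimal sketch of the pattern on its own, assuming ddp_net and a per-rank inputs_list as in the snippet:

with ddp_net.join():
    # inputs_list may have a different length on every rank.
    for inputs in inputs_list:
        ddp_net.zero_grad()
        loss = ddp_net(inputs).norm()
        loss.backward()
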
Example #4
    def _run_basic_test(self, backend, checkpoint, find_unused_parameters=False, static_graph=False):
        dist.init_process_group(
            backend="nccl",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
        # spans GPUs 2 and 3, and the two replicas are synchronized via DDP.
        fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

        class MyModule(nn.Module):
            def __init__(self, device):
                super(MyModule, self).__init__()
                self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
                self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

            def forward(self, inp):
                if find_unused_parameters:
                    return self.fc2(inp)
                else:
                    return self.fc3(self.fc2(inp))

        layer2 = MyModule(2 * self.rank + 1)
        model = nn.Sequential(
            fc1,
            layer2
        )
        model = Pipe(model, chunks=2, checkpoint=checkpoint)
        model = DistributedDataParallel(model, find_unused_parameters=find_unused_parameters)
        if static_graph:
            model._set_static_graph()
        out = model(torch.rand(16, 16).cuda(2 * self.rank)).local_value()
        out.sum().backward()

        # Run forward again for find_unused_parameters to trigger any potential errors.
        if find_unused_parameters:
            model(torch.rand(16, 16).cuda(2 * self.rank))

        # Check grads
        output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
        dist.all_gather(output, fc1.weight.grad)
        self.assertEqual(output[0], output[1])

        output = [torch.empty_like(layer2.fc2.weight.grad), torch.empty_like(layer2.fc2.weight.grad)]
        dist.all_gather(output, layer2.fc2.weight.grad)
        self.assertEqual(output[0], output[1])

        if not find_unused_parameters:
            output = [torch.empty_like(layer2.fc3.weight.grad), torch.empty_like(layer2.fc3.weight.grad)]
            dist.all_gather(output, layer2.fc3.weight.grad)
            self.assertEqual(output[0], output[1])
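
This variant turns on static-graph mode through the private _set_static_graph() helper, whereas the last example in this collection passes static_graph to the DistributedDataParallel constructor. Both spellings are sketched below; the constructor argument only exists on newer PyTorch releases, so treat its availability as an assumption.

# Older releases: construct first, then opt in through the private helper.
ddp = DistributedDataParallel(model)
ddp._set_static_graph()

# Newer releases: the same behavior as a constructor argument.
ddp = DistributedDataParallel(model, static_graph=True)
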
Example #5
    def _remote_worker_process(self):
        gLogger.info("The remote worker is running.")
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )
        global shutdown_signal
        with shutdown_signal:
            shutdown_signal.wait()
        gLogger.info("Exiting remote worker.")
        dist.destroy_process_group()
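
The shutdown_signal the worker blocks on is assumed to be a module-level threading.Condition, with each process holding its own copy that gets notified (presumably over RPC by the master) once the test body completes. A minimal sketch of that assumption; the helper name is hypothetical.

import threading

shutdown_signal = threading.Condition()

def set_shutdown_signal():
    # Wake up everything blocked in shutdown_signal.wait() in this process.
    with shutdown_signal:
        shutdown_signal.notify_all()
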
Example #6
    def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool):
        gLogger.info("Running the master process...")
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )
        remote_em_rref = rpc.remote(self.remote_worker_name(),
                                    RemoteEM,
                                    args=(NUM_EM_ROW, D_SPARSE))
        remote_net_rref = rpc.remote(self.remote_worker_name(),
                                     RemoteNet,
                                     args=(D_DENSE + D_SPARSE, D_HID))
        gLogger.info("Created remote rrefs on master")
        self.do_test_on_master(ddp_mode, simulate_uneven_inputs,
                               remote_em_rref, remote_net_rref)
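
rpc.remote constructs an object on the named worker and hands back an RRef to it, which is how remote_em_rref and remote_net_rref are obtained above. A self-contained illustration of the same pattern; MyNet and the worker name are assumptions, and rpc.init_rpc is presumed to have been called on every participant.

import torch
import torch.distributed.rpc as rpc
import torch.nn as nn

class MyNet(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.fc = nn.Linear(d_in, d_out)

    def forward(self, x):
        return self.fc(x)

# Build MyNet(8, 4) on "worker0"; only a reference comes back to the caller.
net_rref = rpc.remote("worker0", MyNet, args=(8, 4))
# Run the forward pass on the owner and fetch the result.
out = net_rref.rpc_sync().forward(torch.rand(2, 8))
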
Example #7
    def test_ddp_dist_autograd_local_vs_remote(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        # Use two different remote device strings, one with and one without the
        # default device suffix "cpu".
        for remote_device in ["worker0/cpu", "worker0"]:
            remote_layer1 = RemoteModule(
                remote_device=remote_device, module_cls=nn.Linear, args=(10, 5, False)
            )
            layer1 = nn.Linear(10, 5, False)
            # Start with the same parameters for remote and local
            layer1.weight = remote_layer1.module_rref.to_here().weight

            # Run local case.
            layer2 = nn.Linear(5, 1)
            inputs = torch.rand((10, 10))
            ddp_model = DistributedDataParallel(layer2)
            loss = ddp_model(layer1(inputs)).sum()
            loss.backward()

            # Run remote case.
            with dist_autograd.context() as context_id:
                loss = ddp_model(remote_layer1(inputs)).sum()
                dist_autograd.backward(context_id, [loss])
                grads_dict = dist_autograd.get_gradients(context_id)
                dist.barrier()
                self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
                self.assertEqual(
                    layer1.weight.grad,
                    rpc.rpc_sync(
                        "worker0",
                        CommonDdpComparisonTest.get_remote_grads,
                        args=(remote_layer1.module_rref, context_id),
                    ),
                )
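
The remote gradient above is fetched through CommonDdpComparisonTest.get_remote_grads, executed via rpc_sync on the module's owner. A plausible sketch of such a helper (an assumption, not necessarily the exact implementation): read the owner-side gradient for the remote module's weight out of the given distributed autograd context.

    @staticmethod
    def get_remote_grads(module_rref, context_id):
        # Runs on the owner of the remote module: look up the gradient that
        # this distributed autograd context recorded for its weight.
        return dist_autograd.get_gradients(context_id)[module_rref.local_value().weight]
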
Example #8
    def _remote_worker_process(self, ddp_mode):
        gLogger.info("The remote worker is running.")
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        if ddp_mode in (DdpMode.INSIDE, DdpMode.OUTSIDE):
            # new_group needs to be called by all ranks, not only the trainers.
            dist.new_group(TRAINER_RANKS)

        global shutdown_signal
        with shutdown_signal:
            shutdown_signal.wait()
        gLogger.info("Exiting remote worker.")
        dist.destroy_process_group()
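
dist.new_group must be called by every process in the default group, including ranks that will not belong to the new group, which is why the remote worker participates in the call above. A minimal sketch of using the resulting trainer-only group; the TRAINER_RANKS value is an assumption.

# Every rank executes new_group; only the listed ranks belong to the result.
trainer_group = dist.new_group(ranks=TRAINER_RANKS)  # e.g. TRAINER_RANKS = [1, 2]
if dist.get_rank() in TRAINER_RANKS:
    t = torch.ones(1)
    dist.all_reduce(t, group=trainer_group)  # collective restricted to trainers
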
Example #9
    def _trainer_process(self, rank: int):
        gLogger.info(f"Running the trainer #{rank}...")
        gLogger.info(
            f"Initializing the trainer process group on trainer #{rank} with ranks {TRAINER_RANKS}"
        )
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        gLogger.info(f"Waiting for shutdown signal on trainer #{rank}...")

        global shutdown_signal
        with shutdown_signal:
            shutdown_signal.wait()
        gLogger.info(f"Exiting the trainer #{rank}...")
        dist.destroy_process_group()
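
The trainer, remote-worker, and master methods above are presumably selected per rank by the surrounding harness. A hypothetical dispatch of that kind; the rank layout and the REMOTE_WORKER_RANK name are assumptions.

    def run_process(self, ddp_mode, simulate_uneven_inputs=False):
        if self.rank in TRAINER_RANKS:
            self._trainer_process(self.rank)
        elif self.rank == REMOTE_WORKER_RANK:
            self._remote_worker_process(ddp_mode)
        else:
            self._master_process(ddp_mode, simulate_uneven_inputs)
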
Example #10
    def test_ddp_dist_autograd_local_vs_remote_gpu(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        remote_layer1 = RemoteModule(remote_device="worker0/cpu",
                                     module_cls=nn.Linear,
                                     args=(10, 7, False))
        layer1 = nn.Linear(10, 7, False)
        # Start with the same parameters for remote and local
        layer1.weight = remote_layer1.module_rref.to_here().weight

        layer2 = nn.Linear(7, 5).cuda(self.rank)
        ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

        remote_layer3 = RemoteModule(remote_device="worker0/cpu",
                                     module_cls=nn.Linear,
                                     args=(5, 3, False))
        layer3 = nn.Linear(5, 3, False)
        # Start with the same parameters for remote and local
        layer3.weight = remote_layer3.module_rref.to_here().weight

        layer4 = nn.Linear(3, 1).cuda(self.rank)
        ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

        # Run local case.
        inputs = torch.rand((10, 10))
        loss = ddp_layer4(
            layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(
                self.rank)).sum()
        loss.backward()

        # Run remote case.
        with dist_autograd.context() as context_id:
            loss = ddp_layer4(
                remote_layer3(
                    ddp_layer2(remote_layer1(inputs).cuda(
                        self.rank)).cpu()).cuda(self.rank)).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            dist.barrier()
            self.assertEqual(
                layer1.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    DdpComparisonTest.get_remote_grads,
                    args=(remote_layer1.module_rref, context_id),
                ),
            )
            self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
            self.assertEqual(
                layer3.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    DdpComparisonTest.get_remote_grads,
                    args=(remote_layer3.module_rref, context_id),
                ),
            )
            self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
Example #11
    def _run_basic_test(self,
                        backend,
                        checkpoint,
                        find_unused_parameters=False,
                        static_graph=False):
        dist.init_process_group(
            backend=backend,
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
        # spans GPUs 2 and 3, and the two replicas are synchronized via DDP.
        fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

        class MyModule(nn.Module):
            def __init__(self, device):
                super(MyModule, self).__init__()
                self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
                self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

            def forward(self, inp):
                if find_unused_parameters:
                    return self.fc2(inp)
                else:
                    return self.fc3(self.fc2(inp))

        layer2 = MyModule(2 * self.rank + 1)
        model = nn.Sequential(fc1, layer2)
        model = Pipe(model, chunks=2, checkpoint=checkpoint)
        model = DistributedDataParallel(
            model,
            find_unused_parameters=find_unused_parameters,
            static_graph=static_graph,
        )

        # Ensure inputs are different across ranks to verify that gradient
        # sync indeed occurs.
        model_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
        out = model(model_input).local_value()
        out.sum().backward()

        # Run forward again for find_unused_parameters to trigger any potential errors.
        if find_unused_parameters:
            # Ensure inputs are different across ranks to verify that gradient
            # sync indeed occurs.
            unused_param_input = torch.rand(16, 16).cuda(
                2 * self.rank) * (self.rank + 1)
            model(unused_param_input).local_value().sum().backward()

        # Run a few more iterations of fwd + bwd to ensure gradient synchronization
        # occurs properly across iterations via delay_all_reduce/bucketized allreduce.
        for _ in range(3):
            model_input = torch.rand(16, 16).cuda(
                2 * self.rank) * (self.rank + 1)
            out = model(model_input).local_value()
            out.sum().backward()

        # Check grads
        output = [
            torch.empty_like(fc1.weight.grad),
            torch.empty_like(fc1.weight.grad)
        ]
        dist.all_gather(output, fc1.weight.grad)
        self.assertEqual(output[0], output[1])

        output = [
            torch.empty_like(layer2.fc2.weight.grad),
            torch.empty_like(layer2.fc2.weight.grad)
        ]
        dist.all_gather(output, layer2.fc2.weight.grad)
        self.assertEqual(output[0], output[1])

        if not find_unused_parameters:
            output = [
                torch.empty_like(layer2.fc3.weight.grad),
                torch.empty_like(layer2.fc3.weight.grad)
            ]
            dist.all_gather(output, layer2.fc3.weight.grad)
            self.assertEqual(output[0], output[1])
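
A hypothetical driver for this final variant, covering the extra flags it accepts; the method names are assumptions, while "except_last" is one of the checkpoint modes Pipe understands.

    def test_pipe_ddp_find_unused_parameters(self):
        self._run_basic_test("nccl", "except_last", find_unused_parameters=True)

    def test_pipe_ddp_static_graph(self):
        self._run_basic_test("nccl", "except_last", static_graph=True)
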