Example #1
def run_trainer(args, emb_rref_list):
    """
    Trainer function to be run from each machine. This function:
        1. Performs some basic initialization steps.
        2. Prepares random data for training.
        3. Sanity-checks command-line args such as embedding sizes and MLP layers.
        4. Sets up the model, loss function, and Distributed Optimizer.
        5. Runs the training loop.
    """

    ######## BASIC INITIALIZATION ########
    set_rand_seed()
    set_print_options(args.print_precision)

    args.use_gpu = args.use_gpu and torch.cuda.is_available()
    init_gpu(args.use_gpu)
    #print(args)

    ######## PREPARE TRAINING DATA ########
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
    m_den = ln_bot[0]
    # generate input and target data at random
    train_data, train_loader = dp.make_random_data_and_loader(
        args, ln_emb, m_den)
    nbatches = args.num_batches if args.num_batches > 0 else len(train_loader)

    ######## PARSE CMD LINE ARGS ########
    m_spa = args.arch_sparse_feature_size
    num_fea = ln_emb.size + 1  # num sparse + num dense features
    m_den_out = ln_bot[ln_bot.size - 1]
    if args.arch_interaction_op == "dot":
        # approach 1: all
        # num_int = num_fea * num_fea + m_den_out
        # approach 2: unique
        if args.arch_interaction_itself:
            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
        else:
            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
    elif args.arch_interaction_op == "cat":
        num_int = num_fea * m_den_out
    else:
        sys.exit("ERROR: --arch-interaction-op=" + args.arch_interaction_op +
                 " is not supported")
    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
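    # Worked example (illustrative values): with 26 embedding tables, num_fea = 27;
    # for "dot" without self-interaction and m_den_out = 16, the pair count is
    # 27 * 26 // 2 = 351, so num_int = 351 + 16 = 367 and ln_top starts with 367.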

    ######## SANITY CHECKS ########
    # Ensure feature sizes and MLP dimensions match
    if m_den != ln_bot[0]:
        sys.exit("ERROR: arch-dense-feature-size " + str(m_den) +
                 " does not match first dim of bottom mlp " + str(ln_bot[0]))
    if m_spa != m_den_out:
        sys.exit("ERROR: arch-sparse-feature-size " + str(m_spa) +
                 " does not match last dim of bottom mlp " + str(m_den_out))
    if num_int != ln_top[0]:
        sys.exit("ERROR: # of feature interactions " + str(num_int) +
                 " does not match first dimension of top mlp " +
                 str(ln_top[0]))

    # test prints (model arch)
    if args.debug_mode:
        print("model arch:")
        print("mlp top arch " + str(ln_top.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_top)
        print("# of interactions")
        print(num_int)
        print("mlp bot arch " + str(ln_bot.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_bot)
        print("# of features (sparse and dense)")
        print(num_fea)
        print("dense feature size")
        print(m_den)
        print("sparse feature size")
        print(m_spa)
        print("# of embeddings (= # of sparse features) " + str(ln_emb.size) +
              ", with dimensions " + str(m_spa) + "x:")
        print(ln_emb)

        print("data (inputs and targets):")
        for j, (X, offsets, indices, T) in enumerate(train_loader):
            # early exit if nbatches was set by the user and has been exceeded
            if nbatches > 0 and j >= nbatches:
                break

            print("mini-batch: %d" % j)
            print(X.detach().cpu().numpy())
            # transform offsets to lengths when printing
            print([
                np.diff(S_o.detach().cpu().tolist() +
                        list(indices[i].shape)).tolist()
                for i, S_o in enumerate(offsets)
            ])
            print([S_i.detach().cpu().tolist() for S_i in indices])
            print(T.detach().cpu().numpy())

    ######## TRAINING SETUP ########

    # Initialize the model (note we are passing the list of RRefs that point to
    # the remote embeddings).
    dlrm = model.DLRM_RPC(
        emb_rref_list,
        args.distributed_rank,
        args.use_gpu,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op=args.arch_interaction_op,
        arch_interaction_itself=args.arch_interaction_itself,
        sigmoid_bot=-1,
        sigmoid_top=ln_top.size - 2,
    )

    # Specify the loss function
    loss_fn = torch.nn.MSELoss(reduction="mean")

    model_parameter_rrefs = []
    # RRefs for embeddings from PS
    for ind, emb_rref in enumerate(emb_rref_list):
        ps_name = "ps{}".format(ind)
        model_parameter_rrefs.extend(
            rpc.rpc_sync(ps_name,
                         _retrieve_embedding_parameters,
                         args=(emb_rref, )))
    # RRefs local to the model (MLP)
    for param in dlrm.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Build DistributedOptimizer.
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=args.learning_rate,
    )
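    # DistributedOptimizer creates a local optim.SGD instance on every worker that
    # owns any of the parameter RRefs above (the parameter servers for the embedding
    # tables, this trainer for the MLPs) and applies the gradients recorded in the
    # given distributed autograd context when opt.step(context_id) is called.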

    def time_wrap(use_gpu):
        if use_gpu:
            torch.cuda.synchronize()
        return time.time()

    # TODO: uncomment for comp/comms DDP benchmark
    #if args.distributed_rank == 0:
    #    state_dict_top = {}
    #    state_dict_bot = {}
    #    dlrm.top_mlp_ddp.register_comm_hook(state_dict_top, profile_hook)
    #    dlrm.bot_mlp_ddp.register_comm_hook(state_dict_bot, profile_hook)

    # training or inference
    best_gA_test = 0
    best_auc_test = 0
    total_time = 0
    total_loss = 0
    total_accu = 0
    total_iter = 0
    total_samp = 0

    # Lists to track forward and backward times per iteration
    fwd_times = []
    bwd_times = []

    rpc_fwd_times = []
    embedding_lookup_times = []

    ######## RUN TRAINING LOOP ########
    with torch.autograd.profiler.profile(enabled=args.enable_profiling,
                                         use_cuda=args.use_gpu) as prof:
        for epoch in range(args.nepochs):

            accum_time_begin = time_wrap(args.use_gpu)

            if args.mlperf_logging:
                previous_iteration_time = None

            for j, (X, offsets, indices, T) in enumerate(train_loader):

                if args.mlperf_logging:
                    current_time = time_wrap(args.use_gpu)
                    if previous_iteration_time:
                        iteration_time = current_time - previous_iteration_time
                    else:
                        iteration_time = 0
                    previous_iteration_time = current_time
                else:
                    t1 = time_wrap(args.use_gpu)

                # early exit if nbatches was set by the user and has been exceeded
                if nbatches > 0 and j >= nbatches:
                    break

                # create distributed autograd context
                with dist_autograd.context() as context_id:
                    # Run forward pass
                    fwd_start = time_wrap(args.use_gpu)
                    Z, rpc_delays, embed_lookup_delay, rpc_total = dlrm.forward(
                        X, offsets, indices)
                    fwd_end = time_wrap(args.use_gpu)

                    # Compute Loss
                    E = loss_fn(Z, T)

                    # Run distributed backward pass
                    bwd_start = time_wrap(args.use_gpu)
                    dist_autograd.backward(context_id, [E])
                    bwd_end = time_wrap(args.use_gpu)

                    # Run distributed optimizer
                    opt.step(context_id)

                    if epoch >= args.warmup_epochs:
                        fwd_times.append(fwd_end - fwd_start)
                        bwd_times.append(bwd_end - bwd_start)
                        rpc_fwd_times.extend(rpc_delays)
                        embedding_lookup_times.append(embed_lookup_delay)

                # compute loss and accuracy
                L = E.detach().cpu().numpy()  # numpy array
                S = Z.detach().cpu().numpy()  # numpy array
                T = T.detach().cpu().numpy()  # numpy array
                mbs = T.shape[0]  # = args.mini_batch_size, except maybe for the last batch
                A = np.sum((np.round(S, 0) == T).astype(np.uint8))

                if args.mlperf_logging:
                    total_time += iteration_time
                else:
                    t2 = time_wrap(args.use_gpu)
                    total_time += t2 - t1
                total_accu += A
                total_loss += L * mbs
                total_iter += 1
                total_samp += mbs

                should_print = ((j + 1) % args.print_freq
                                == 0) or (j + 1 == nbatches)
                should_test = ((args.test_freq > 0)
                               and (args.data_generation == "dataset")
                               and (((j + 1) % args.test_freq == 0) or
                                    (j + 1 == nbatches)))

                # print time, loss and accuracy
                if should_print or should_test:
                    gT = 1000.0 * total_time / total_iter if args.print_time else -1
                    total_time = 0

                    gA = total_accu / total_samp
                    total_accu = 0

                    gL = total_loss / total_samp
                    total_loss = 0

                    str_run_type = "inference" if args.inference_only else "training"
                    print(
                        "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".
                        format(str_run_type, j + 1, nbatches, epoch, gT) +
                        "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100))

                    log_iter = nbatches * epoch + j + 1
                    # Uncomment the line below to print out the total time with overhead
                    # print("Accumulated time so far: {}" \
                    # .format(time_wrap(args.use_gpu) - accum_time_begin))
                    total_iter = 0
                    total_samp = 0

        # END TRAIN LOOP
        # TODO: uncomment for comp/comms DDP benchmark
        # TODO: for bottom also
        #torch.cuda.synchronize(args.distributed_rank)
        #if args.distributed_rank == 0:
        #    for bucket_index in range(len(state_dict_top)):
        #        e_bfr = state_dict_top[bucket_index]["e_bfr"]
        #        e_aft = state_dict_top[bucket_index]["e_aft"]
        #        print(f"bucket {bucket_index} comm time: {e_bfr.elapsed_time(e_aft)}")

        mean_fwd = 1000.0 * np.mean(fwd_times)
        mean_bwd = 1000.0 * np.mean(bwd_times)
        std_fwd = 1000.0 * np.std(fwd_times)
        std_bwd = 1000.0 * np.std(bwd_times)
        rpc_fwd_mean = 1000.0 * np.mean(rpc_fwd_times)
        rpc_fwd_std = 1000.0 * np.std(rpc_fwd_times)
        embedding_fwd_mean = 1000.0 * np.mean(embedding_lookup_times)
        embedding_fwd_std = 1000.0 * np.std(embedding_lookup_times)

        print("[Trainer {}] Average FWD Time (ms): {}".format(
            args.distributed_rank, mean_fwd))
        print("[Trainer {}] STD DEV FWD Time (ms): {}".format(
            args.distributed_rank, std_fwd))
        print("[Trainer {}] Average BWD Time (ms): {}".format(
            args.distributed_rank, mean_bwd))
        print("[Trainer {}] STD DEV BWD Time (ms): {}".format(
            args.distributed_rank, std_bwd))
        print("[Trainer {}] Average RPC FWD Time (ms): {}".format(
            args.distributed_rank, rpc_fwd_mean))
        print("[Trainer {}] STD DEV RPC FWD Time (ms): {}".format(
            args.distributed_rank, rpc_fwd_std))
        print("[Trainer {}] Average Embedding Lookup Time (ms): {}".format(
            args.distributed_rank, embedding_fwd_mean))
        print("[Trainer {}] STD DEV Embedding Lookup Time (ms): {}".format(
            args.distributed_rank, embedding_fwd_std))

    # profiling
    if args.enable_profiling:
        with open("dlrm_s_pytorch.prof", "w") as prof_f:
            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
            prof.export_chrome_trace("./dlrm_s_pytorch.json")
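
The trainer above pulls the embedding parameters from each parameter server through _retrieve_embedding_parameters, which is not shown in this example. A minimal sketch of such a helper, assuming each RRef wraps an nn.Module (e.g. an nn.EmbeddingBag) owned by that parameter server:

from torch.distributed.rpc import RRef

def _retrieve_embedding_parameters(emb_rref):
    # Runs on the parameter server that owns emb_rref: wrap each local parameter
    # in an RRef so the trainer's DistributedOptimizer can address it remotely.
    return [RRef(p) for p in emb_rref.local_value().parameters()]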
Example #2
    def test_dist_optim(self):
        # local version
        module1 = MyModule()
        module2 = MyModule()
        params = [module1.get_w(), module2.get_w()]
        local_optim = optim.SGD(params, lr=0.05)

        old_w1 = module1.w.clone().detach()
        old_w2 = module2.w.clone().detach()

        g_cpu = torch.Generator()
        g_cpu.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        output1 = module1.forward(t2)
        output2 = module2.forward(output1)
        loss = torch.add(output2, t1).sum()

        loss.backward()
        local_optim.step()

        # distributed version
        owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
        owner2 = "worker%d" % ((self.rank + 2) % self.world_size)

        remote_module1 = rpc.remote(owner1, MyModule)
        remote_module2 = rpc.remote(owner2, MyModule)
        remote_param1 = remote_method(MyModule.get_w, remote_module1)
        remote_param2 = remote_method(MyModule.get_w, remote_module2)

        old_w1_remote = remote_param1.to_here()

        # sanity check: local and remote initial weights should match
        self.assertEqual(old_w1, remote_param1.to_here())
        self.assertEqual(old_w2, remote_param2.to_here())

        dist_optim = DistributedOptimizer(optim.SGD,
                                          [remote_param1, remote_param2],
                                          lr=0.05)

        with dist_autograd.context() as context_id:
            g_cpu.manual_seed(0)
            t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            output1 = rpc_async_method(MyModule.forward, remote_module1, t2)
            output2 = rpc_async_method(MyModule.forward, remote_module2,
                                       output1.wait())
            loss = torch.add(output2.wait(), t1)

            dist_autograd.backward(context_id, [loss.sum()])
            dist_optim.step(context_id)

            new_w1 = rpc_async_method(MyModule.get_w, remote_module1).wait()
            new_w2 = rpc_async_method(MyModule.get_w, remote_module2).wait()

            # ensure optimizer changed weights
            self.assertNotEqual(old_w1, new_w1)
            self.assertNotEqual(old_w2, new_w2)
            # ensure local equals remote
            self.assertEqual(new_w1, module1.get_w())
            self.assertEqual(new_w2, module2.get_w())
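
This test relies on remote_method and rpc_async_method helpers that call a bound method on the object behind a remote RRef. A plausible sketch of those helpers, assuming they live at module level next to the test:

import torch.distributed.rpc as rpc

def _call_method(method, obj_rref, *args, **kwargs):
    # Executed on the owner of obj_rref: unwrap the RRef and call the method.
    return method(obj_rref.local_value(), *args, **kwargs)

def remote_method(method, obj_rref, *args, **kwargs):
    # Returns an RRef to the result, owned by the worker that owns obj_rref.
    return rpc.remote(obj_rref.owner(), _call_method,
                      args=[method, obj_rref] + list(args), kwargs=kwargs)

def rpc_async_method(method, obj_rref, *args, **kwargs):
    # Returns a Future for the result of calling the method on the owner.
    return rpc.rpc_async(obj_rref.owner(), _call_method,
                         args=[method, obj_rref] + list(args), kwargs=kwargs)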
Example #3
    def test_ddp_dist_autograd_local_vs_remote_gpu(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        remote_layer1 = RemoteModule("worker0",
                                     device="cpu",
                                     module_cls=nn.Linear,
                                     args=(10, 7, False))
        layer1 = nn.Linear(10, 7, False)
        # Start with the same parameters for remote and local
        layer1.weight = remote_layer1.module_rref.to_here().weight

        layer2 = nn.Linear(7, 5).cuda(self.rank)
        ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

        remote_layer3 = RemoteModule("worker0",
                                     device="cpu",
                                     module_cls=nn.Linear,
                                     args=(5, 3, False))
        layer3 = nn.Linear(5, 3, False)
        # Start with the same parameters for remote and local
        layer3.weight = remote_layer3.module_rref.to_here().weight

        layer4 = nn.Linear(3, 1).cuda(self.rank)
        ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

        # Run local case.
        inputs = torch.rand((10, 10))
        loss = ddp_layer4(
            layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(
                self.rank)).sum()
        loss.backward()

        # Run remote case.
        with dist_autograd.context() as context_id:
            loss = ddp_layer4(
                remote_layer3(
                    ddp_layer2(remote_layer1(inputs).cuda(
                        self.rank)).cpu()).cuda(self.rank)).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            dist.barrier()
            self.assertEqual(
                layer1.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    DdpComparisonTest.get_remote_grads,
                    args=(remote_layer1.module_rref, context_id),
                ),
            )
            self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
            self.assertEqual(
                layer3.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    DdpComparisonTest.get_remote_grads,
                    args=(remote_layer3.module_rref, context_id),
                ),
            )
            self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
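
The assertions above fetch gradients recorded on worker0 through DdpComparisonTest.get_remote_grads. A minimal sketch of such a helper, assuming the remote module exposes a weight attribute (dist_autograd is torch.distributed.autograd, as above):

    @staticmethod
    def get_remote_grads(module_rref, context_id):
        # Runs on the RRef's owner: return the gradient that the given distributed
        # autograd context recorded for the remote module's weight.
        return dist_autograd.get_gradients(context_id)[module_rref.local_value().weight]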
Example #4
def _run_trainer(remote_emb_module, rank):
    r"""
    Each trainer runs a forward pass which involves an embedding lookup on the
    parameter server and running nn.Linear locally. During the backward pass,
    DDP is responsible for aggregating the gradients for the dense part
    (nn.Linear), and distributed autograd ensures gradient updates are
    propagated to the parameter server.
    """

    # Setup the model.
    model = HybridModel(remote_emb_module, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters for embedding table.
    model_parameter_rrefs = model.remote_emb_module.remote_parameters()

    # model.fc.parameters() only includes local parameters.
    # NOTE: Cannot call model.parameters() here,
    # because this will call remote_emb_module.parameters(),
    # which supports remote_parameters() but not parameters().
    for param in model.fc.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            start = 0
            batch_size = 0
            while start < num_indices:
                offsets.append(start)
                start += random.randint(1, 10)
                batch_size += 1
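            # Example (illustrative values): num_indices = 25 with
            # offsets = [0, 4, 10, 18] describes 4 bags of lengths 4, 6, 8 and 7,
            # the same offsets convention that nn.EmbeddingBag expects.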

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads
        print("Training done for epoch {}".format(epoch))
Example #5
def dist_backward_script(context_id: int, loss: torch.Tensor):
    dist_autograd.backward(context_id, [loss])
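
A brief usage sketch (model and inputs are stand-ins for whatever produces the loss): the caller owns the distributed autograd context and passes its id together with the loss.

with dist_autograd.context() as context_id:
    loss = model(inputs).sum()
    dist_backward_script(context_id, loss)
    # The gradients now live in this context and can be consumed by, e.g.,
    # DistributedOptimizer.step(context_id).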
Example #6
def study():
    """
    Async multiplication using two remote modules
    """
    # Start with a local version
    module1 = MyModule()
    module2 = MyModule()
    params = [module1.get_w(), module2.get_w()]
    local_optim = optim.SGD(params, lr=0.05)

    # Keep a copy of the old weights to make sure they change
    old_w1 = module1.w.clone().detach()
    old_w2 = module2.w.clone().detach()

    torch.manual_seed(0)
    t1 = torch.rand((3, 3), requires_grad=True)
    t2 = torch.rand((3, 3), requires_grad=True)

    output1 = module1.forward(t2)
    output2 = module2.forward(output1)
    loss = torch.add(output2, t1).sum()

    loss.backward()
    local_optim.step()

    # distributed version
    owner1 = "worker%d" % ((Env.rank + 1) % Env.world_size)
    owner2 = "worker%d" % ((Env.rank + 2) % Env.world_size)

    remote_module1 = rpc.remote(owner1, MyModule)
    remote_module2 = rpc.remote(owner2, MyModule)
    remote_param1 = remote_method(MyModule.get_w, remote_module1)
    remote_param2 = remote_method(MyModule.get_w, remote_module2)

    old_w1_remote = remote_param1.to_here()

    dist_optim = DistributedOptimizer(
        optim.SGD, [remote_param1, remote_param2], lr=0.05
    )

    with dist_autograd.context() as context_id:
        torch.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)

        output1 = remote_async(MyModule.forward, remote_module1, t2)
        output2 = remote_async(MyModule.forward, remote_module2, output1.wait())
        loss = torch.add(output2.wait(), t1)

        dist_autograd.backward(context_id, [loss.sum()])
        dist_optim.step(context_id)

        new_w1 = remote_async(MyModule.get_w, remote_module1).wait()
        new_w2 = remote_async(MyModule.get_w, remote_module2).wait()

        # Make sure the weights have been updated
        print(f'w1 updated: {not torch.equal(old_w1, new_w1)}')
        print(f'w2 updated: {not torch.equal(old_w2, new_w2)}')

        # Make sure the weights on the remote module and the local copy are the same
        w1_consistent = (new_w1 == module1.get_w()).all()
        w2_consistent = (new_w2 == module2.get_w()).all()

        print(f'w1 consist: {w1_consistent}')
        print(f'w2 consist: {w2_consistent}')
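
This example assumes a MyModule class that rpc.remote can construct on another worker, plus remote_method / remote_async wrappers along the lines of the helpers sketched under Example #2. A minimal sketch of MyModule:

import torch

class MyModule:
    def __init__(self):
        # Fixed seed so every instance (local or remote) starts from the same
        # weight, which makes the local-vs-remote comparison meaningful.
        torch.manual_seed(0)
        self.w = torch.rand((3, 3), requires_grad=True)

    def forward(self, t):
        # Elementwise multiply so gradients flow back into self.w.
        return torch.mul(self.w, t)

    def get_w(self):
        return self.w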