def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)
    os.environ["MASTER_PORT"] = "10639"
    dist.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size)

    model = get_model()
    data, target = get_data()[0]
    loss_fn = get_loss_fun()

    device = torch.device("cuda",
                          rank) if DEVICE == "cuda" else torch.device("cpu")

    model = MultiProcessPipe(
        model,
        balance=[2, 1],
        style=MultiProcessPipe.MultiProcess,
        worker_map={
            0: "worker0",
            1: "worker1"
        },  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # define optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # zero the parameter gradients
    optimizer.zero_grad()

    # outputs and target need to be on the same device
    # forward step
    outputs = model(data.to(device))
    # compute loss
    if rank == 1:
        loss = loss_fn(outputs.to(device), target.to(device))

        # backward + optimize
        loss.backward()
        optimizer.step()
    else:
        model.back_helper(outputs)

    print(f"Finished Training Step on {rank}")
    dist.rpc.shutdown()

    del model
Esempio n. 2
0
def run(rank, world_size):
    torch_pg.init_mpi()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)  # FIXME (supports gloo)
    os.environ["MASTER_PORT"] = "10639"
    torch.distributed.rpc.init_rpc(f"worker{rank}",
                                   rank=rank,
                                   world_size=world_size)
    initialize_model_parallel(1, world_size, pipeline_backend="mpi")

    if rank == 1:
        # For RPC, all ranks other than 0 just need to call rpc.shutdown()
        torch.distributed.rpc.shutdown()
        return

    model = getModel()
    data, target = getData()[0]
    loss_fn = getLossFun()

    device = torch.device("cuda", rank)

    model = fairscale.nn.PipeRPCWrapper(
        model,
        balance=[2, 1],
        worker_map={
            0: "worker0",
            1: "worker1"
        },  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # We can't directly access the model on each worker, so we need to call
    # foreach_worker with a callback to setup the optimizer
    model.foreach_worker(register_optimizer, {"lr": 0.001}, include_self=True)

    outputs = model(data.to(device))
    loss = loss_fn(outputs.to(device), target.to(device))
    loss.backward()

    # Same as earlier, use foreach_worker to step the optimizer on each rank
    model.foreach_worker(run_optimizer, include_self=True)

    print(f"Finished Training Step on {rank}")

    torch.distributed.rpc.shutdown()

    del model
Esempio n. 3
0
def train(rank: int, world_size: int, epochs: int, use_oss: bool):

    # DDP
    dist_init(rank, world_size)
    rank = torch.device("cpu") if DEVICE == "cpu" else rank

    # Problem statement
    model = getModel().to(rank)
    dataloader = getData(n_batches=1)
    loss_fn = getLossFun()

    optimizer: Optional[Union[OSS, torch.optim.SGD]] = None

    if not use_oss:
        optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
    else:
        base_optimizer = torch.optim.SGD
        base_optimizer_arguments = {
            "lr": 1e-4
        }  # any optimizer specific arguments, LR, momentum, etc...
        optimizer = OSS(params=model.parameters(),
                        optim=base_optimizer,
                        **base_optimizer_arguments)

    training_start = time.monotonic()
    # Any relevant training loop, nothing specific to OSS. For example:
    model.train()

    for _ in range(epochs):
        for (data, target) in dataloader:
            data, target = data.to(rank), target.to(rank)

            # Train
            model.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, target)
            loss.backward()

            # if you want to clip the gradients / get the current max:
            max_norm = 1000.0
            norm_type = 1
            if not use_oss:
                _total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_norm,
                    norm_type=norm_type)  # type: ignore
            else:
                optimizer = cast(OSS, optimizer)
                _total_norm = optimizer.clip_grad_norm(max_norm,
                                                       norm_type=norm_type)

            optimizer.step()

            print(f"Loss: {loss.item()}")

    training_end = time.monotonic()
    print(
        f"[{dist.get_rank()}] : Training done. {training_end-training_start:.2f} sec"
    )

    if DEVICE == "cuda":
        max_memory = torch.cuda.max_memory_allocated(rank)
        print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")