import torch
import torch.distributed as dist
from torch.nn import Linear, Sequential

# Assumed import path: the early fairscale API in which ShardedDataParallel
# wraps the module together with a sharded (OSS) optimizer.
from fairscale.nn.data_parallel import ShardedDataParallel


def run_one_step(rank, world_size, backend, device, temp_file_name):
    """Check that the reduce step populates the local shard's gradients and that one optimizer step lowers the loss."""
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    if device == torch.device("cuda"):
        torch.cuda.set_device(rank)

    model = Sequential(Linear(2, 3), Linear(3, 4)).to(device)
    ddp = ShardedDataParallel(
        module=model,
        optimizer=torch.optim.SGD,
        optimizer_params={"lr": 0.1, "momentum": 0.99},
        world_size=world_size,
    )
    optimizer = ddp.optimizer

    input_tensor = torch.rand((64, 2)).to(device)
    output = ddp(input_tensor).abs().sum() / input_tensor.numel()
    output.backward()
    ddp.reduce()

    # Check that all the grads have been populated, for the shard
    if device == torch.device("cuda"):
        torch.cuda.synchronize()  # flush any remaining cuda op, just in case
    for pg in optimizer.optim.param_groups:
        for param in pg["params"]:
            if param.requires_grad:
                assert param.grad.abs().sum().item() > 0.0, "The reduce step should have populated all the gradients"

    # Check that the optimization process makes sense (ie. loss goes down for the same data)
    optimizer.step()
    new_eval = ddp(input_tensor).abs().sum() / input_tensor.numel()
    assert new_eval.item() < output.item(), "The loss should decrease after one optimizer step"

    dist.destroy_process_group()
def run_one_step_grad_values(rank, world_size, backend, device, temp_file_name):
    """Check the exact values of the reduced gradients, and that buffers are synced from rank 0."""
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    if device == torch.device("cuda"):
        torch.cuda.set_device(rank)

    # Any model works. Add one different buffer per rank
    model = Sequential(Linear(2, 3)).to(device)
    model.register_buffer("test_buffer", torch.ones((1)) * rank)

    def weights_init(m):
        if isinstance(m, Linear):
            torch.nn.init.constant_(m.weight.data, 1.0)
            torch.nn.init.constant_(m.bias.data, 1.0)

    model.apply(weights_init)
    model.to(device)

    ddp = ShardedDataParallel(
        module=model,
        optimizer=torch.optim.SGD,
        optimizer_params={"lr": 0.01, "momentum": 0.99},
        world_size=world_size,
        broadcast_buffers=True,
    )
    optimizer = ddp.optimizer
    model = ddp.module

    # Different input per rank, allows for checking that the gradients have been properly reduced
    input_tensor = (torch.ones((64, 2)) * rank).to(device)
    output = ddp(input_tensor).abs().sum()
    output.backward()
    ddp.reduce()

    # Check that the grads on the shard hold the expected averaged values
    # (known in closed form for world_size == 2 and the constant init above)
    for pg in optimizer.optim.param_groups:
        for param in pg["params"]:
            if param.shape == torch.Size([3, 2]):
                assert param.grad[0, 0].cpu() == torch.tensor([32.0])
            if param.shape == torch.Size([3]):
                assert param.grad[0].cpu() == torch.tensor([64.0])

    # Check that all the buffers are in sync (authoritative rank is 0, its buffer is 0)
    for b in model.buffers():
        assert b.cpu().item() == 0.0

    dist.destroy_process_group()
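# The values asserted above follow from the constant init: with weights and biases
# set to 1 and per-rank input rank * ones(64, 2), each weight gradient entry is
# 64 * rank and each bias gradient entry is 64, so averaging over two ranks gives
# 32.0 and 64.0. A minimal single-process sanity check, assuming world_size == 2
# (this sketch is illustrative and not part of the original tests):

def check_grad_values_locally():
    """Reproduce the averaged gradients of run_one_step_grad_values by hand, without distributed setup."""

    def manual_grads(rank):
        m = Linear(2, 3)
        torch.nn.init.constant_(m.weight.data, 1.0)
        torch.nn.init.constant_(m.bias.data, 1.0)
        m(torch.ones((64, 2)) * rank).abs().sum().backward()
        return m.weight.grad, m.bias.grad

    w0, b0 = manual_grads(0)
    w1, b1 = manual_grads(1)
    assert ((w0 + w1) / 2)[0, 0] == 32.0  # matches the weight gradient checked above
    assert ((b0 + b1) / 2)[0] == 64.0  # matches the bias gradient checked above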
def run_one_step_buffer_sync(rank, world_size, backend, device, temp_file_name):
    """Check that the reduce step populates the local shard's gradients and that buffers are synced from rank 0."""
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    if device == torch.device("cuda"):
        torch.cuda.set_device(rank)

    # Any model works. Add one different buffer per rank
    model = Sequential(Linear(2, 3), Linear(3, 4)).to(device)
    model.register_buffer("test_buffer", torch.ones((1)) * rank)
    model.to(device)

    ddp = ShardedDataParallel(
        module=model,
        optimizer=torch.optim.SGD,
        optimizer_params={"lr": 0.01, "momentum": 0.99},
        world_size=world_size,
        broadcast_buffers=True,
    )
    optimizer = ddp.optimizer
    model = ddp.module

    input_tensor = torch.rand((64, 2)).to(device)
    output = ddp(input_tensor).abs().sum() / input_tensor.numel()
    output.backward()
    ddp.reduce()

    # Check that all the grads have been populated, for the shard
    if device == torch.device("cuda"):
        torch.cuda.synchronize()  # flush any remaining cuda op, just in case
    for pg in optimizer.optim.param_groups:
        for param in pg["params"]:
            if param.requires_grad:
                assert param.grad.abs().sum().item() > 0.0, "The reduce step should have populated all the gradients"

    # Check that all the buffers are in sync (authoritative rank is 0, its buffer is 0)
    for b in model.buffers():
        assert b.cpu().item() == 0.0

    dist.destroy_process_group()
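# All three step functions share the launch signature
# (rank, world_size, backend, device, temp_file_name), so a single driver can
# spawn each of them. A sketch, assuming a CPU run with the gloo backend and
# two processes; spawn_test and its defaults are illustrative, not part of the
# original tests:

import tempfile

import torch.multiprocessing as mp


def spawn_test(fn, world_size=2, backend="gloo", device=torch.device("cpu")):
    # mkstemp() returns (fd, path); the path serves as the file:// rendezvous
    # for init_process_group, fresh for each test so ranks cannot collide.
    temp_file_name = tempfile.mkstemp()[1]
    mp.spawn(fn, args=(world_size, backend, device, temp_file_name), nprocs=world_size, join=True)


if __name__ == "__main__":
    check_grad_values_locally()
    for test in (run_one_step, run_one_step_grad_values, run_one_step_buffer_sync):
        spawn_test(test)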