def test_nested_input(setup_rpc):
    class NestedInput(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc_a = nn.Linear(1, 1)
            self.fc_b = nn.Linear(1, 1)

        def forward(self, inp):
            return inp

    model = nn.Sequential(NestedInput())
    model = Pipe(model, chunks=2)

    a = torch.rand(10, 1, requires_grad=True)
    b = torch.rand(10, 1, requires_grad=True)

    # TypeError: expected Tensor, but got tuple
    with pytest.raises(TypeError):
        model((a, (a, b))).local_value()

    # TypeError: expected Tensor, but got list
    with pytest.raises(TypeError):
        model((a, [a, b])).local_value()

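# For reference, a plain tensor (or a flat tuple of tensors) is accepted by
# Pipe's forward method; only nested containers are rejected. A minimal sketch,
# assuming the same ``setup_rpc`` fixture and imports as the tests above:
def test_flat_input_sketch(setup_rpc):
    model = Pipe(nn.Sequential(nn.Linear(1, 1)), chunks=2)
    a = torch.rand(10, 1, requires_grad=True)
    # The RRef returned by forward is local, so local_value() yields the output.
    out = model(a).local_value()
    assert out.shape == (10, 1)
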
def test_checkpoint_eval(setup_rpc):
    model = nn.Sequential(nn.Linear(1, 1))
    model = Pipe(model, chunks=2)
    input = torch.rand(2, 1)

    def find_grad_fn(grad_fn, name):
        if grad_fn is None:
            return False
        if grad_fn.__class__.__name__ == name:
            return True
        for next_grad_fn, _ in grad_fn.next_functions:
            if find_grad_fn(next_grad_fn, name):
                return True
        return False

    model.train()
    train_output = model(input)
    assert find_grad_fn(train_output.local_value().grad_fn, "CheckpointBackward")
    assert find_grad_fn(train_output.local_value().grad_fn, "RecomputeBackward")

    model.eval()
    eval_output = model(input)
    assert not find_grad_fn(eval_output.local_value().grad_fn, "CheckpointBackward")
    assert not find_grad_fn(eval_output.local_value().grad_fn, "RecomputeBackward")

def test_verify_module_duplicate_children(setup_rpc):
    conv = nn.Conv2d(3, 3, 1)
    model = nn.Sequential(conv, conv)

    with pytest.raises(ValueError, match="module with duplicate children is not supported"):
        Pipe(model)

def test_verify_module_non_sequential(setup_rpc):
    with pytest.raises(TypeError, match="module must be nn.Sequential to be partitioned"):
        Pipe(nn.Module())

def test_deny_moving(setup_rpc):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)
    model = nn.Sequential(a, b)
    model = Pipe(model)

    # Moving is denied.
    with pytest.raises(TypeError):
        model.cuda()
    with pytest.raises(TypeError):
        model.cpu()
    with pytest.raises(TypeError):
        model.to(torch.device("cuda"))
    with pytest.raises(TypeError):
        model.to(0)
    with pytest.raises(TypeError):
        model.to("cuda")
    with pytest.raises(TypeError):
        model.to(device=0)
    with pytest.raises(TypeError):
        model.to(torch.rand(1))
    with pytest.raises(TypeError):
        model.to(tensor=torch.rand(1))

    # Casting is allowed.
    model.half()
    model.to(torch.double)
    model.to(dtype=torch.float)

def test_parameters(setup_rpc):
    model = nn.Sequential(nn.Linear(1, 1))
    pipe = Pipe(model, chunks=1)
    assert list(pipe.parameters()) != []

def test_pipe_without_rpc():
    model = nn.Sequential(nn.Linear(1, 1))
    with pytest.raises(RuntimeError, match='Please initialize RPC framework'):
        pipe = Pipe(model, chunks=1)

def test_checkpoint_mode_invalid(setup_rpc):
    model = nn.Sequential(nn.Linear(1, 1))
    with pytest.raises(ValueError, match="checkpoint is not one of 'always', 'except_last', or 'never'"):
        Pipe(model, chunks=2, checkpoint="INVALID_CHECKPOINT")

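# For contrast, the three documented checkpoint modes are accepted. A minimal
# sketch, assuming the same ``setup_rpc`` fixture and imports as the tests above:
def test_checkpoint_mode_valid_sketch(setup_rpc):
    for mode in ("always", "except_last", "never"):
        Pipe(nn.Sequential(nn.Linear(1, 1)), chunks=2, checkpoint=mode)
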
def test_delete_portal_tensor(train, checkpoint, setup_rpc):
    # Without checkpointing:
    # +- Stash --+  +--- Pop ----+ - - - layers
    # | 2,blue,1 |--| 1,orange,0 | - - - tensor_life and portal function
    # +----------+  +------------+
    #
    # With checkpointing:
    # +- Stash --+  +--- Pop ----+  +--- Pop'----+  +- Stash'--+
    # | 3,blue,2 |--| 2,orange,1 |--| 1,orange,0 |--| 1,blue,0 |
    # +----------+  +------------+  +------------+  +----------+
    def portal_tensor_life_is(tensor_life, skip_tracker=None):
        if skip_tracker is None:
            skip_tracker = current_skip_tracker()

        # Get the current portal.
        portal = list(skip_tracker.portals.values())[0]

        if tensor_life == 0:
            return portal.tensor_life == 0 and portal.tensor is None
        else:
            return portal.tensor_life == tensor_life and portal.tensor is not None

    # Check the portal tensor after 'Stash'.
    stash_ = Stash()

    @stash_.register_forward_hook
    def check_portal_tensor_after_stash(*_):
        if is_checkpointing():
            assert portal_tensor_life_is(2)
        elif is_recomputing():
            assert portal_tensor_life_is(0)
        else:
            assert portal_tensor_life_is(1)

    pop_ = Pop()

    @pop_.register_forward_hook
    def check_portal_tensor_after_pop(*_):
        if is_checkpointing():
            assert portal_tensor_life_is(1)
        elif is_recomputing():
            assert portal_tensor_life_is(0)
        else:
            assert portal_tensor_life_is(0)

    class NoPortalTensorAtBackward(nn.Module):
        class F(torch.autograd.Function):
            @staticmethod
            def forward(ctx, input):
                ctx.skip_tracker = current_skip_tracker()
                return input.detach()

            @staticmethod
            def backward(ctx, grad):
                assert portal_tensor_life_is(0, skip_tracker=ctx.skip_tracker)
                return grad

        def forward(self, input):
            return self.F.apply(input)

    model = nn.Sequential(NoPortalTensorAtBackward(), stash_, pop_)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)

    input = torch.rand(10, requires_grad=True)

    if train:
        model.train()
        output = model(input).local_value()
        output.norm().backward()
    else:
        model.eval()
        with torch.no_grad():
            model(input)

    transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
    if i != 0 and i % (partition_len) == 0:
        module_list.append(nn.Sequential(*tmp_list))
        tmp_list = []
    device = i // (partition_len)
    tmp_list.append(transformer_block.to(device))

# Add decoder in the end.
tmp_list.append(Decoder(ntokens, emsize).cuda(num_gpus - 1))
module_list.append(nn.Sequential(*tmp_list))

from torch.distributed.pipeline.sync import Pipe

# Build the pipeline.
chunks = 8
model = Pipe(torch.nn.Sequential(*module_list), chunks=chunks)


def get_total_params(module: torch.nn.Module):
    total_params = 0
    for param in module.parameters():
        total_params += param.numel()
    return total_params

print('Total parameters in model: {:,}'.format(get_total_params(model)))

######################################################################
# Run the model
# -------------
#

def run_worker(rank, world_size):

######################################################################
# Load and batch data
# -------------------
#


######################################################################
# The training process uses Wikitext-2 dataset from ``torchtext``. The
# vocab object is built based on the train dataset and is used to numericalize
# tokens into tensors. Starting from sequential data, the ``batchify()``
# function arranges the dataset into columns, trimming off any tokens remaining
# after the data has been divided into batches of size ``batch_size``.
# For instance, with the alphabet as the sequence (total length of 26)
# and a batch size of 4, we would divide the alphabet into 4 sequences of
# length 6:
#
# .. math::
#    \begin{bmatrix}
#    \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
#    \end{bmatrix}
#    \Rightarrow
#    \begin{bmatrix}
#    \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
#    \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
#    \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
#    \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
#    \end{bmatrix}
#
# These columns are treated as independent by the model, which means that
# the dependence of ``G`` and ``F`` can not be learned, but allows more
# efficient batch processing.
#

# In 'run_worker'
    def print_with_rank(msg):
        print('[RANK {}]: {}'.format(rank, msg))

    from torchtext.datasets import WikiText2
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    def data_process(raw_text_iter):
        data = [
            torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter
        ]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)

    device = torch.device(2 * rank)

    def batchify(data, bsz, rank, world_size, is_train=False):
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        # Divide the data across the ranks only for training data.
        if is_train:
            data_per_rank = data.size(0) // world_size
            data = data[rank * data_per_rank:(rank + 1) * data_per_rank]
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, rank, world_size, True)
    val_data = batchify(val_data, eval_batch_size, rank, world_size)
    test_data = batchify(test_data, eval_batch_size, rank, world_size)

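######################################################################
# A quick, self-contained illustration of the reshaping that ``batchify``
# performs (a hypothetical toy example, not part of the original tutorial):
# a flat stream of 26 tokens with ``bsz = 4`` is trimmed to 24 tokens and
# arranged into 4 columns of length 6, matching the alphabet figure above.

# In 'run_worker' -- illustration only
    toy_tokens = torch.arange(26)                # stand-in for the letters A..Z
    toy_bsz = 4
    toy_nbatch = toy_tokens.size(0) // toy_bsz   # 6 full rows per column
    toy_tokens = toy_tokens.narrow(0, 0, toy_nbatch * toy_bsz)
    toy_columns = toy_tokens.view(toy_bsz, -1).t().contiguous()
    assert toy_columns.shape == (6, 4)
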
######################################################################
# Functions to generate input and target sequence
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#


######################################################################
# The ``get_batch()`` function generates the input and target sequence for
# the transformer model. It subdivides the source data into chunks of
# length ``bptt``. For the language modeling task, the model needs the
# following words as ``Target``. For example, with a ``bptt`` value of 2,
# we’d get the following two Variables for ``i`` = 0:
#
# .. image:: ../_static/img/transformer_input_target.png
#
# It should be noted that the chunks are along dimension 0, consistent
# with the ``S`` dimension in the Transformer model. The batch dimension
# ``N`` is along dimension 1.
#

# In 'run_worker'
    bptt = 35

    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        # Need batch dimension first for pipeline parallelism.
        return data.t(), target

######################################################################
# Model scale and Pipe initialization
# -----------------------------------
#


######################################################################
# To demonstrate training large Transformer models using pipeline parallelism,
# we scale up the Transformer layers appropriately. We use an embedding
# dimension of 4096, a hidden size of 4096, 16 attention heads and 8 total
# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with
# **~1 billion** parameters.
#
# We need to initialize the `RPC Framework <https://pytorch.org/docs/stable/rpc.html>`__
# since Pipe depends on the RPC framework via `RRef <https://pytorch.org/docs/stable/rpc.html#rref>`__,
# which allows for future expansion to cross host pipelining. We need to
# initialize the RPC framework with only a single worker since we're using a
# single process to drive multiple GPUs.
#
# The pipeline is then initialized with 4 transformer layers on one GPU and 4
# transformer layers on the other GPU. One pipe is set up across GPUs 0 and 1 and
# another across GPUs 2 and 3. Both pipes are then replicated using DistributedDataParallel.

# In 'run_worker'
    ntokens = len(vocab)  # the size of vocabulary
    emsize = 4096  # embedding dimension
    nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 8  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 16  # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value

    from torch.distributed import rpc
    tmpfile = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(tmpfile.name),
            # Specifying _transports and _channels is a workaround; it is no
            # longer necessary for PyTorch versions >= 1.8.1.
            _transports=["ibv", "uv"],
            _channels=["cuda_ipc", "cuda_basic"],
        )
    )

    # Number of GPUs for model parallelism.
    num_gpus = 2
    partition_len = ((nlayers - 1) // num_gpus) + 1

    # Add encoder in the beginning.
    tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)]
    module_list = []

    # Add all the necessary transformer blocks.
    for i in range(nlayers):
        transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
        if i != 0 and i % (partition_len) == 0:
            module_list.append(nn.Sequential(*tmp_list))
            tmp_list = []
        device = i // (partition_len)
        tmp_list.append(transformer_block.to(2 * rank + device))

    # Add decoder in the end.
    tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1))
    module_list.append(nn.Sequential(*tmp_list))

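######################################################################
# A quick sanity check of the partitioning arithmetic above (a hypothetical
# illustration, not part of the original tutorial): with ``nlayers = 8`` and
# ``num_gpus = 2``, ``partition_len = ((8 - 1) // 2) + 1 = 4``, so layers
# 0-3 land on device ``2 * rank`` and layers 4-7 on device ``2 * rank + 1``.

# In 'run_worker' -- illustration only
    assert ((8 - 1) // 2) + 1 == 4
    assert [layer // 4 for layer in range(8)] == [0, 0, 0, 0, 1, 1, 1, 1]
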
    # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
    # doesn't work with DDP.
    from torch.distributed.pipeline.sync import Pipe

    # Build the pipeline.
    chunks = 8
    model = Pipe(torch.nn.Sequential(*module_list), chunks=chunks, checkpoint="never")

    # Initialize process group and wrap model in DDP.
    from torch.nn.parallel import DistributedDataParallel
    import torch.distributed as dist
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    model = DistributedDataParallel(model)

    def get_total_params(module: torch.nn.Module):
        total_params = 0
        for param in module.parameters():
            total_params += param.numel()
        return total_params

    print_with_rank('Total parameters in model: {:,}'.format(get_total_params(model)))

######################################################################
# Run the model
# -------------
#


######################################################################
# `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
# is applied to track the loss and
# `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
# implements stochastic gradient descent as the optimizer. The initial
# learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
# applied to adjust the learning rate through epochs. During training, we use the
# `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
# function to scale all gradients together to prevent them from exploding.

# In 'run_worker'
    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    import time

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(vocab)

        # Train only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, train_data.size(0) - 1)

        for batch, i in enumerate(range(0, nbatches, bptt)):
            data, targets = get_batch(train_data, i)
            optimizer.zero_grad()
            # Since the Pipe is only within a single host and process, the ``RRef``
            # returned by the forward method is local to this node and can simply be
            # retrieved via ``RRef.local_value()``.
            output = model(data).local_value()
            # Need to move targets to the device where the output of the
            # pipeline resides.
            loss = criterion(output.view(-1, ntokens), targets.cuda(2 * rank + 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 10
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | '
                                'lr {:02.2f} | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, batch, nbatches // bptt,
                                    scheduler.get_last_lr()[0],
                                    elapsed * 1000 / log_interval,
                                    cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    def evaluate(eval_model, data_source):
        eval_model.eval()  # Turn on the evaluation mode
        total_loss = 0.
        ntokens = len(vocab)
        # Evaluate only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, data_source.size(0) - 1)
        with torch.no_grad():
            for i in range(0, nbatches, bptt):
                data, targets = get_batch(data_source, i)
                output = eval_model(data).local_value()
                output_flat = output.view(-1, ntokens)
                # Need to move targets to the device where the output of the
                # pipeline resides.
                total_loss += len(data) * criterion(
                    output_flat, targets.cuda(2 * rank + 1)).item()
        return total_loss / (len(data_source) - 1)

######################################################################
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.

# In 'run_worker'
    best_val_loss = float("inf")
    epochs = 3  # The number of epochs
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(model, val_data)
        print_with_rank('-' * 89)
        print_with_rank('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                   val_loss, math.exp(val_loss)))
        print_with_rank('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()


######################################################################
# Evaluate the model with the test dataset
# ----------------------------------------
#
# Apply the best model to check the result with the test dataset.

# In 'run_worker'
    test_loss = evaluate(best_model, test_data)
    print_with_rank('=' * 89)
    print_with_rank('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print_with_rank('=' * 89)

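######################################################################
# A minimal launcher sketch (an assumption, since the entry point is not shown
# in this excerpt): two processes, each driving a pair of GPUs, spawned via
# ``torch.multiprocessing``; ``mp.spawn`` passes the process rank as the first
# argument to ``run_worker(rank, world_size)``.

import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 2
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)
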
def _run_basic_test(self, backend, checkpoint, find_unused_parameters=False):
    # Use the backend passed in by the caller rather than hardcoding it.
    dist.init_process_group(
        backend=backend,
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use 4 GPUs: two replicas of a pipe, one across GPUs 0 and 1 and the
    # other across GPUs 2 and 3. The two replicas are kept in sync via DDP.
    fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

    class MyModule(nn.Module):
        def __init__(self, device):
            super(MyModule, self).__init__()
            self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
            self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

        def forward(self, inp):
            if find_unused_parameters:
                return self.fc2(inp)
            else:
                return self.fc3(self.fc2(inp))

    layer2 = MyModule(2 * self.rank + 1)
    model = nn.Sequential(fc1, layer2)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)
    model = DistributedDataParallel(model, find_unused_parameters=find_unused_parameters)
    out = model(torch.rand(16, 16).cuda(2 * self.rank)).local_value()
    out.sum().backward()

    # Run forward again for find_unused_parameters to trigger any potential errors.
    if find_unused_parameters:
        model(torch.rand(16, 16).cuda(2 * self.rank))

    # Check grads
    output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
    dist.all_gather(output, fc1.weight.grad)
    self.assertEqual(output[0], output[1])

    output = [torch.empty_like(layer2.fc2.weight.grad), torch.empty_like(layer2.fc2.weight.grad)]
    dist.all_gather(output, layer2.fc2.weight.grad)
    self.assertEqual(output[0], output[1])

    if not find_unused_parameters:
        output = [torch.empty_like(layer2.fc3.weight.grad), torch.empty_like(layer2.fc3.weight.grad)]
        dist.all_gather(output, layer2.fc3.weight.grad)
        self.assertEqual(output[0], output[1])

def _run_basic_test(self, backend, checkpoint, find_unused_parameters=False, static_graph=False):
    dist.init_process_group(
        backend=backend,
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use 4 GPUs: two replicas of a pipe, one across GPUs 0 and 1 and the
    # other across GPUs 2 and 3. The two replicas are kept in sync via DDP.
    fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

    class MyModule(nn.Module):
        def __init__(self, device):
            super(MyModule, self).__init__()
            self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
            self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

        def forward(self, inp):
            if find_unused_parameters:
                return self.fc2(inp)
            else:
                return self.fc3(self.fc2(inp))

    layer2 = MyModule(2 * self.rank + 1)
    model = nn.Sequential(fc1, layer2)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)
    model = DistributedDataParallel(
        model,
        find_unused_parameters=find_unused_parameters,
        static_graph=static_graph,
    )

    # Ensure inputs are different across ranks to verify that gradient
    # sync indeed occurs.
    model_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
    out = model(model_input).local_value()
    out.sum().backward()

    # Run forward again for find_unused_parameters to trigger any potential errors.
    if find_unused_parameters:
        # Ensure inputs are different across ranks to verify that gradient
        # sync indeed occurs.
        unused_param_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
        model(unused_param_input).local_value().sum().backward()

    # Run a few more iterations of fwd + bwd to ensure gradient synchronization
    # occurs properly across iterations via delay_all_reduce/bucketized allreduce.
    for _ in range(3):
        model_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
        out = model(model_input).local_value()
        out.sum().backward()

    # Check grads
    output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
    dist.all_gather(output, fc1.weight.grad)
    self.assertEqual(output[0], output[1])

    output = [torch.empty_like(layer2.fc2.weight.grad), torch.empty_like(layer2.fc2.weight.grad)]
    dist.all_gather(output, layer2.fc2.weight.grad)
    self.assertEqual(output[0], output[1])

    if not find_unused_parameters:
        output = [torch.empty_like(layer2.fc3.weight.grad), torch.empty_like(layer2.fc3.weight.grad)]
        dist.all_gather(output, layer2.fc3.weight.grad)
        self.assertEqual(output[0], output[1])

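# A hypothetical sketch of how the helper above might be driven from concrete
# test methods on the same test class (the real test names and decorators are
# not shown in this excerpt), exercising the unused-parameters and static-graph
# paths:
def test_pipe_with_ddp_sketch(self):
    self._run_basic_test("nccl", "never")

def test_pipe_with_ddp_unused_params_sketch(self):
    self._run_basic_test("nccl", "never", find_unused_parameters=True)

def test_pipe_with_ddp_static_graph_sketch(self):
    self._run_basic_test("nccl", "never", static_graph=True)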