def test_simple_linears():
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()
    grad_without_pipe = sum_grad(model.parameters())
    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2], devices=["cpu", "cpu"], chunks=4)
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()
    grad_with_pipe = sum_grad(model.parameters())

    # Both grads should be identical.
    assert torch.allclose(grad_with_pipe, grad_without_pipe)
def _train_pipe_model(model, use_fp16=False, checkpoint="never", chunks=1):
    model = copy.deepcopy(model)
    model = Pipe(
        model,
        balance=[1] * torch.cuda.device_count(),
        devices=list(range(torch.cuda.device_count())),
        chunks=chunks,
        checkpoint=checkpoint,
    )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    return _train(model, optimizer, use_fp16)
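# The `_train` helper above is defined elsewhere in the test suite. Below is a
# minimal, hypothetical sketch of what such a helper could look like, assuming
# a single forward/backward/step on random data; the input shape, loss, and the
# name `_train_sketch` are illustrative, not the original code.
def _train_sketch(model, optimizer, use_fp16=False):
    if use_fp16:
        model = model.half()
    inputs = torch.rand(2, 16, device=model.devices[0])  # hypothetical shape
    if use_fp16:
        inputs = inputs.half()
    loss = model(inputs).mean()  # any scalar loss works for a smoke test
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return model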
def simple_linears(pipeline_style):
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without Pipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()
    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]
    ref_without_pipe = [p.grad for p in model.parameters()]
    zero_grad(model.parameters())

    # With Pipe
    model = Pipe(model, [2, 2], style=pipeline_style, worker_map=get_worker_map(), chunks=4)
    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(model.pipeline.mp_partitions[0].module.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(model.pipeline.mp_partitions[0].module.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])

    torch.distributed.barrier()
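# `set_random_seed` and `get_worker_map` above are test utilities defined
# elsewhere (`get_worker_map` maps pipeline ranks to worker processes and is
# not sketched here). A minimal sketch of what the seeding helper is assumed
# to do, namely seed every RNG the test might touch; the real helper may differ.
import random

import numpy
import torch


def set_random_seed_sketch(seed: int) -> None:
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)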
def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError('Please install fairscale with: pip install fairscale')

    super().__init__()
    assert isinstance(encoder, FairseqEncoder)
    assert isinstance(decoder, FairseqDecoder)
    module_list = nn.Sequential(
        encoder.embedding_layer,
        *list(encoder.encoder_layers),
        encoder.final_layer_norm,
        decoder.embedding_layer,
        *list(decoder.decoder_layers),
        decoder.decoder_output_layer,
    )
    self.devices = devices
    self.model = Pipe(
        module_list,
        balance=balance,
        devices=devices,
        chunks=chunks,
        checkpoint=checkpoint,
    )
    self.encoder_max_positions = self.max_positions_helper(
        encoder.embedding_layer, 'max_source_positions')
    self.decoder_max_positions = self.max_positions_helper(
        decoder.embedding_layer, 'max_target_positions')
    self.adaptive_softmax = getattr(decoder, 'adaptive_softmax', None)
def init_components(
    self, model_fn=None, criterion_fn=None, optimizer_fn=None, scheduler_fn=None,
):
    """Inits the run's components."""
    model = model_fn()

    if "balance" not in self.pipe_kwargs:
        warnings.warn(
            "With FairScale Pipe setup, "
            "you need to specify ``balance`` under ``pipe_kwargs``. "
            "Generating balance automatically. (Experimental feature)"
        )
        self.pipe_kwargs["balance"] = _generate_balance(self.device_count, len(model))
    pipe_model = Pipe(model, **self.pipe_kwargs)
    del model

    # criterion
    criterion = criterion_fn()
    # optimizer
    optimizer = optimizer_fn(pipe_model)
    # scheduler
    scheduler = scheduler_fn()

    return pipe_model, criterion, optimizer, scheduler
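# `_generate_balance` above is assumed to split `num_layers` sequential modules
# as evenly as possible across `device_count` devices. A minimal sketch under
# that assumption; the library's actual helper may differ in detail.
def _generate_balance_sketch(device_count, num_layers):
    balance = []
    layers_left = num_layers
    for rank in range(device_count):
        # Give each remaining device its fair share of the remaining layers.
        per_device = layers_left // (device_count - rank)
        balance.append(per_device)
        layers_left -= per_device
    return balance


# e.g. _generate_balance_sketch(2, 7) -> [3, 4]; the sum always equals num_layers.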
def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError("Please install fairscale with: pip install fairscale")

    self.use_pipeline = encoder_module_list is not None
    if not self.use_pipeline:
        self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
        self.encoder_layers = nn.Sequential(
            *[TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
        )
        if isinstance(embed_tokens, nn.ModuleList):
            emb_dim = sum(e.embedding_dim for e in embed_tokens)
        else:
            emb_dim = embed_tokens.embedding_dim
        self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
    else:
        encoder_balance = utils.eval_str_list(args.pipeline_encoder_balance, type=int)
        encoder_devices = utils.eval_str_list(args.pipeline_encoder_devices, type=int)
        assert sum(encoder_balance) == len(encoder_module_list), (
            f"Sum of encoder_balance={encoder_balance} is not equal "
            + f"to num_encoder_modules={len(encoder_module_list)}"
        )
        self.model = Pipe(
            module=nn.Sequential(*encoder_module_list),
            balance=encoder_balance,
            devices=encoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError("Please install fairscale with: pip install fairscale")

    super().__init__()
    assert isinstance(encoder, FairseqEncoder)
    assert isinstance(decoder, FairseqDecoder)
    encoder_module_list = (
        [encoder.embedding_layer]
        + list(encoder.encoder_layers)
        + [encoder.final_layer_norm]
    )
    self.num_encoder_modules = len(encoder_module_list)
    decoder_module_list = (
        [decoder.embedding_layer]
        + list(decoder.decoder_layers)
        + [decoder.decoder_output_layer]
    )
    self.num_decoder_modules = len(decoder_module_list)
    module_list = encoder_module_list + decoder_module_list
    self.devices = devices
    self.model = Pipe(
        nn.Sequential(*module_list),
        balance=balance,
        devices=devices,
        chunks=chunks,
        checkpoint=checkpoint,
    )
    self.encoder_max_positions = self.max_positions_helper(
        encoder.embedding_layer, "max_source_positions")
    self.decoder_max_positions = self.max_positions_helper(
        decoder.embedding_layer, "max_target_positions")
    self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None)
    # Note: To be populated during inference
    self.encoder = None
    self.decoder = None
def benchmark_single_process(args):
    """Benchmark a given model using a single process and multiple devices."""
    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    torch.distributed.init_process_group(
        backend="gloo", rank=0, world_size=1, init_method=init_method_pgroup
    )

    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    assert num_devices > 0
    init_random_seed(0)

    benchmark_config = create_benchmark_config(args.model_name)
    model_specs = get_model_specs(args.model_name)
    model_config = create_model_config(
        args, benchmark_config=benchmark_config, model_specs=model_specs
    )
    model = model_config["model"]

    balance = generate_balance(min(num_devices, 4), len(model))
    pipe_model = Pipe(model, balance, chunks=args.chunks, checkpoint=args.checkpoint)
    del model
    del model_config["model"]

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, model_specs, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, model_specs, args)
def make_model(device, ntokens):
    ninp = 50  # embedding dimension
    nhid = 50  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multi-head attention models
    dropout = 0
    initrange = 0.1

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
    # num_devices is a module-level global in the original benchmark script.
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()
    lr = 0.001  # learning rate

    try:
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        # Fall back to a plain Adam when the fp16-aware optimizer is unavailable.
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
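# A minimal sketch of how the (pipe, criterion, optimizer, scaler) tuple
# returned by make_model might drive one mixed-precision training step. The
# function name and the batch/target shapes are hypothetical; only the
# GradScaler calls are standard torch.cuda.amp API.
def train_step_sketch(p, criterion, optimizer, scaler, batch, targets):
    optimizer.zero_grad()
    output = p(batch)
    # Flatten (seq, batch, vocab) logits for token-level cross entropy.
    loss = criterion(output.view(-1, output.size(-1)), targets.view(-1))
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)  # unscales gradients, then runs optimizer.step()
    scaler.update()
    return loss.item()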
def __init__(
    self, args, dictionary, embed_tokens, no_encoder_attn=False, decoder_module_list=None,
):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError("Please install fairscale with: pip install fairscale")

    if decoder_module_list is None:
        embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
        layers = [
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ]
        decoder_output_layer = TransformerDecoderOutputLayer(args, embed_tokens, dictionary)
        decoder_module_list = [embedding_layer] + layers + [decoder_output_layer]

    self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None
    if self.use_pipeline:
        decoder_balance = utils.eval_str_list(args.pipeline_decoder_balance, type=int)
        decoder_devices = utils.eval_str_list(args.pipeline_decoder_devices, type=int)
        assert sum(decoder_balance) == len(decoder_module_list), (
            f"Sum of decoder_balance={decoder_balance} is not equal "
            + f"to num_decoder_modules={len(decoder_module_list)}"
        )
        self.model = Pipe(
            module=nn.Sequential(*decoder_module_list),
            balance=decoder_balance,
            devices=decoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
    else:
        self.embedding_layer = decoder_module_list[0]
        self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1])
        self.decoder_output_layer = decoder_module_list[-1]
def benchmark_single_process(args):
    """Benchmark a given model using a single process and multiple devices."""
    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    assert num_devices > 0
    init_random_seed(0)

    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, benchmark_config=benchmark_config)
    model = model_config["model"]

    balance = generate_balance(min(num_devices, 4), len(model))
    pipe_model = Pipe(model, balance, chunks=args.chunks, checkpoint=args.checkpoint)
    del model
    del model_config["model"]

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, args)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    kwargs = {"batch_size": args.batch_size}
    kwargs.update({"num_workers": 1, "pin_memory": True, "shuffle": True})

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

    model = net
    model = Pipe(model, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        print(f">>> TRAINING Time {toc - tic:0.4f} seconds")
        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
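# `net` above is defined outside this snippet; given balance=[6, 6], it must be
# an nn.Sequential with 12 child modules. A hypothetical MNIST-shaped stand-in,
# with layer choices that are illustrative rather than the original definition:
import torch.nn as nn

net_sketch = nn.Sequential(
    nn.Conv2d(1, 32, 3, 1),   # stage 1 starts: 28x28 -> 26x26
    nn.ReLU(),
    nn.Conv2d(32, 64, 3, 1),  # 26x26 -> 24x24
    nn.ReLU(),
    nn.MaxPool2d(2),          # 24x24 -> 12x12
    nn.Dropout(0.25),         # stage 1 ends after module 6
    nn.Flatten(),             # stage 2 starts: 64 * 12 * 12 = 9216 features
    nn.Linear(9216, 128),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(128, 10),
    nn.LogSoftmax(dim=1),     # pairs with NLLLoss in the (elided) train/test
)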