def make_adam(params): if args.ddp_zero: return OSS(params=params, optim=Adam, group=get_data_parallel_group(), lr=lr) else: return Adam(params, lr=lr)
def make_adam(model): if args.ddp_zero: return OSS(params=model.parameters(), optim=Adam, group=get_data_parallel_group(), lr=lr) else: return Adam(model.parameters(), lr=lr)
def run_test_initialize_model_parallel(rank, model_parallel_size, filename, filename_rpc): dist_init(rank, model_parallel_size, filename, filename_rpc) if torch.distributed.get_rank() == 0: print("> testing initialize_model_parallel with size {} ...".format( model_parallel_size)) model_parallel_size_ = min(model_parallel_size, torch.distributed.get_world_size()) assert not mpu.model_parallel_is_initialized() mpu.initialize_model_parallel(model_parallel_size_) assert mpu.model_parallel_is_initialized() # Checks. def check(group, world_size, rank): assert world_size == torch.distributed.get_world_size(group=group) assert rank == torch.distributed.get_rank(group=group) # Model parallel. world_size = model_parallel_size_ rank = torch.distributed.get_rank() % model_parallel_size_ assert world_size == mpu.get_model_parallel_world_size() assert rank == mpu.get_model_parallel_rank() check(mpu.get_model_parallel_group(), world_size, rank) # Data parallel. world_size = torch.distributed.get_world_size() // model_parallel_size_ rank = torch.distributed.get_rank() // model_parallel_size assert world_size == mpu.get_data_parallel_world_size() assert rank == mpu.get_data_parallel_rank() check(mpu.get_data_parallel_group(), world_size, rank) # Reset groups mpu.destroy_model_parallel() torch.distributed.barrier() if torch.distributed.get_rank() == 0: print(">> passed the test :-)")
def train(model_config, model, benchmark_config, args): lm_dataloader, _, _ = model_config["data"] criterion = benchmark_config["criterion"] vocab_size = benchmark_config["vocab_size"] optimizer = model_config["optimizer"] model.train() log_number_of_parameters(model) total_loss = 0.0 word_counter = 0 optimizer = optimizer(model.parameters()) pipe_group = model.group if hasattr(model, "group") else None if args.ddp_zero: model = DDP( model, device_ids=[torch.cuda.current_device()], process_group=get_data_parallel_group(), find_unused_parameters=False, ) # TODO(anj-s): Avoid sending fake data to all replicas except the first and last one. if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != (pipe_group.size() - 1): lm_dataloader = get_fake_dataloader(len(lm_dataloader)) total_tokens = 0 total_tokens_per_log_interval = 0 bptt = 2 start_time = time.time() epoch_start_time = 0.0 def get_batch(source): seq_len = len(source) - 1 data = source[0:seq_len] target = source[1 : 1 + seq_len] return data, target for i, batch in enumerate(lm_dataloader): if i == 1: epoch_start_time = time.time() source, target = get_batch(batch) if args.max_batch and i > args.max_batch: break if i > 0: total_tokens += source.numel() optimizer.zero_grad() try: if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero: tmp = source.to(get_device(model, 0)) output = model(tmp) else: output = model(source) except Exception as e: raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: target = target.to(get_device(model, -1)) output = output.to(target.device) loss = criterion(output.view(-1, vocab_size), target.view(-1)) if args.ddp_zero: ddp_group = get_data_parallel_group() torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group) loss /= ddp_group.size() loss.backward() del target else: if args.ddp_zero: model.module.back_helper(output) else: model.back_helper(output) del output torch.nn.utils.clip_grad_value_(model.parameters(), benchmark_config["clip_value"]) optimizer.step() if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: total_loss += loss.item() log_interval = 1 total_tokens_per_log_interval += source.numel() if i % log_interval == 0 and i > 0: cur_loss = total_loss / log_interval elapsed = time.time() - start_time print( "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}".format( i, total_tokens_per_log_interval / elapsed, cur_loss, math.exp(cur_loss) ) ) total_tokens_per_log_interval = 0 total_loss = 0 start_time = time.time() if epoch_start_time != 0: wps = total_tokens / (time.time() - epoch_start_time) else: raise RuntimeError( "Unable to benchmark on a single batch. Increase the size " " of the dataset and rerun the benchmark." ) return wps, loss.item()
def train(lm_dataloader, model, criterion, optimizer, vocab_size, args): model.train() from functools import reduce import operator num_params = reduce(operator.add, (reduce(operator.mul, x.size()) for x in model.parameters())) if model.group: total = torch.Tensor([num_params]) if torch.cuda.is_available(): total = total.cuda() torch.distributed.all_reduce(total, group=model.group) logging.info( f"training model, #prams = {num_params}, group: {model.group.rank()}, grank:" f" {torch.distributed.get_rank()}, sizes {model.group.size()}") torch.distributed.barrier() if model.group.rank() == 0: logging.info(f"total #prams = {total.item()}") else: logging.info(f"training model, #prams = {num_params}") vocab_size = 10000 # FIXME total_loss = 0.0 start_time = time.time() word_counter = 0 optimizer = optimizer(model) def get_first_device(model): if isinstance(model, DDP): model = model.module if not torch.cuda.is_available(): return torch.device("cpu") if model.devices: return model.devices[0] else: return torch.cuda.current_device() def get_last_device(model): if isinstance(model, DDP): model = model.module if not torch.cuda.is_available(): return torch.device("cpu") if model.devices: return model.devices[-1] else: return torch.cuda.current_device() pipe_group = model.group if args.ddp_zero: model = DDP( model, device_ids=[torch.cuda.current_device()], process_group=get_data_parallel_group(), find_unused_parameters=False, ) if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != ( pipe_group.size() - 1): thing = {"input": torch.zeros(args.batch_size)} class FakeDataset: def __getitem__(self, index): return thing def __len__(self): return len(lm_dataloader) lm_dataloader = FakeDataset() for i, batch in enumerate(lm_dataloader): bi = batch["input"] if args.max_batch and i > args.max_batch: break optimizer.zero_grad() try: if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero: tmp = batch["input"].to(get_first_device(model)) output = model(tmp) else: output = model(batch["input"]) except Exception as e: raise RuntimeError( f"training failed on {torch.distributed.get_rank()}") from e if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: target = batch["target"].to(get_last_device(model)) output = output.to(target.device) loss = criterion(output.view(-1, vocab_size), target.view(-1)) if args.ddp_zero: ddp_group = get_data_parallel_group() torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group) loss /= ddp_group.size() loss.backward() del target else: if args.ddp_zero: model.module.back_helper(output) else: model.back_helper(output) del output torch.nn.utils.clip_grad_value_(model.parameters(), 0.05) optimizer.step() if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: total_loss += loss.item() log_interval = 1 word_counter += batch["ntokens"] if i % log_interval == 0 and i > 0: cur_loss = total_loss / log_interval elapsed = time.time() - start_time print( "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}". format(i, word_counter / elapsed, cur_loss, math.exp(cur_loss))) word_counter = 0 total_loss = 0 start_time = time.time()
def train(data_config, model, benchmark_config, args): lm_dataloader = data_config["data"] criterion = benchmark_config["criterion"] vocab_size = benchmark_config["vocab_size"] optimizer = data_config["optimizer"] model.train() log_number_of_parameters(model) total_loss = 0.0 start_time = time.time() word_counter = 0 optimizer = optimizer(model.parameters()) pipe_group = model.group if args.ddp_zero: model = DDP( model, device_ids=[torch.cuda.current_device()], process_group=get_data_parallel_group(), find_unused_parameters=False, ) # TODO(anj-s): Avoid sending fake data to all replicas except the first and last one. if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != (pipe_group.size() - 1): lm_dataloader = get_fake_dataloader(len(lm_dataloader)) total_tokens = 0 total_tokens_per_log_interval = 0 for i, batch in enumerate(lm_dataloader): if args.max_batch and i > args.max_batch: break total_tokens += batch["input"].numel() optimizer.zero_grad() try: if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero: tmp = batch["input"].to(get_device(model, 0)) output = model(tmp) else: output = model(batch["input"]) except Exception as e: raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: target = batch["target"].to(get_device(model, -1)) output = output.to(target.device) loss = criterion(output.view(-1, vocab_size), target.view(-1)) if args.ddp_zero: ddp_group = get_data_parallel_group() torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group) loss /= ddp_group.size() loss.backward() del target else: if args.ddp_zero: model.module.back_helper(output) else: model.back_helper(output) del output torch.nn.utils.clip_grad_value_(model.parameters(), benchmark_config["clip_value"]) optimizer.step() if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: total_loss += loss.item() log_interval = 1 total_tokens_per_log_interval += batch["input"].numel() if i % log_interval == 0 and i > 0: cur_loss = total_loss / log_interval elapsed = time.time() - start_time print( "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}".format( i, total_tokens_per_log_interval / elapsed, cur_loss, math.exp(cur_loss) ) ) total_tokens_per_log_interval = 0 total_loss = 0 start_time = time.time() return total_tokens, loss.item()