def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
    """Initialize deepspeed.comm and execute the user function."""
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = get_master_port()
    os.environ['LOCAL_RANK'] = str(local_rank)
    # NOTE: unit tests don't support multi-node so local_rank == global rank
    os.environ['RANK'] = str(local_rank)
    os.environ['WORLD_SIZE'] = str(num_procs)

    # turn off NCCL logging if set
    os.environ.pop('NCCL_DEBUG', None)

    set_cuda_visibile()

    deepspeed.init_distributed(dist_backend=backend)
    # dist.init_process_group(backend=backend)
    dist.barrier()

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    run_func(*func_args, **func_kwargs)

    # make sure all ranks finish at the same time
    dist.barrier()
    # tear down after test completes
    dist.destroy_process_group()
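dist_init above depends on names supplied by its enclosing test harness (run_func, backend, get_master_port, set_cuda_visibile). A minimal, hypothetical driver for such a helper, assuming torch.multiprocessing is used to fork one worker per simulated rank (the two-process default is an assumption, not part of the original tests):

import torch.multiprocessing as mp

def run_parallel_test(num_procs=2, *func_args):
    # Launch one process per simulated rank; mp.spawn passes the process index
    # as the first argument, which dist_init treats as local_rank.
    mp.spawn(dist_init, args=(num_procs, *func_args), nprocs=num_procs, join=True)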
def init_deepspeed_comm(backend):
    global dist
    import deepspeed
    import deepspeed.comm as dist
    deepspeed.init_distributed(dist_backend=backend)
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
def __init__(self, context: det_ds.DeepSpeedTrialContext):
    self.context = context
    self.hparams = attrdict.AttrDict(context.get_hparams())
    if (self.hparams.test_manual_init_distributed
            or self.hparams.test_fail_manual_init_distributed):
        assert (not torch.distributed.is_initialized()
                ), "distributed backend should not be initialized"
    if (self.hparams.test_manual_init_distributed
            and not self.hparams.test_fail_manual_init_distributed):
        deepspeed.init_distributed(auto_mpi_discovery=False)
    if self.hparams.test_manual_grad_acc or self.hparams.test_fail_manual_grad_acc:
        self.context.disable_auto_grad_accumulation()
    if self.hparams.test_manual_dataloader:
        self.context.disable_dataset_reproducibility_checks()

    self.ds_config = attrdict.AttrDict(self.hparams.deepspeed_config)

    model = torch.nn.Linear(1, 1)
    self.model, optimizer, _, _ = deepspeed.initialize(
        model=model,
        config=self.ds_config,
        model_parameters=model.parameters(),
        dist_init_required=False,
    )
    self.model = self.context.wrap_model_engine(self.model)
    self.loss = torch.nn.MSELoss()

    self.reducer = None
    if self.hparams.test_custom_reducer:
        self.reducer = self.context.wrap_reducer(lambda x: np.mean(x) * 2,
                                                 name="loss_2x")
def initialize_distributed(args):
    """Initialize torch.distributed."""
    if args.deepspeed:
        deepspeed.init_distributed(dist_backend=args.distributed_backend)
    else:
        # Manually set the device ids.
        device = args.rank % torch.cuda.device_count()
        # Call the init process
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            init_method=init_method)

    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)
def _dist_init(self, local_rank, num_procs, skip_msg):
    """Initialize deepspeed.comm and execute the user function."""
    if self.set_dist_env:
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = get_master_port()
        os.environ['LOCAL_RANK'] = str(local_rank)
        # NOTE: unit tests don't support multi-node so local_rank == global rank
        os.environ['RANK'] = str(local_rank)
        os.environ['WORLD_SIZE'] = str(num_procs)

    # turn off NCCL logging if set
    os.environ.pop('NCCL_DEBUG', None)

    set_cuda_visibile()

    if self.init_distributed:
        deepspeed.init_distributed(dist_backend=self.backend)
        dist.barrier()

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    try:
        self.current_test(**self.test_kwargs)
    except BaseException as e:
        if isinstance(e, Skipped):
            skip_msg.put(e.msg)
        else:
            raise e

    if self.init_distributed or dist.is_initialized():
        # make sure all ranks finish at the same time
        dist.barrier()
        # tear down after test completes
        dist.destroy_process_group()
def _setup_devices(self) -> "torch.device": logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") self._n_gpu = 0 elif is_torch_tpu_available(): device = xm.xla_device() self._n_gpu = 0 elif is_sagemaker_mp_enabled(): local_rank = smp.local_rank() device = torch.device("cuda", local_rank) self._n_gpu = 1 elif is_sagemaker_dp_enabled(): sm_dist.init_process_group() self.local_rank = sm_dist.get_local_rank() device = torch.device("cuda", self.local_rank) self._n_gpu = 1 elif self.deepspeed: # deepspeed performs its own DDP internally, and requires the program to be started with: # deepspeed ./program.py # rather than: # python -m torch.distributed.launch --nproc_per_node=2 ./program.py from .integrations import is_deepspeed_available if not is_deepspeed_available(): raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") import deepspeed deepspeed.init_distributed() # workaround for setups like notebooks where the launcher can't be used, # but deepspeed requires a dist env. # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed self.local_rank = int(os.environ.get("LOCAL_RANK", "-1")) device = torch.device("cuda", self.local_rank) self._n_gpu = 1 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will # trigger an error that a device index is missing. Index 0 takes into account the # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` # will use the first GPU in that env, i.e. GPU#1 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at # the default value. self._n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) self._n_gpu = 1 if device.type == "cuda": torch.cuda.set_device(device) return device
def init_deepspeed(do_init):
    """Initialize the DeepSpeed distributed backend."""
    global using_deepspeed
    using_deepspeed = do_init

    if not do_init:
        return
    deepspeed.init_distributed()
def test(self):
    torch.distributed.init_process_group(
        backend='nccl',
        init_method=f"tcp://127.0.0.1:{get_master_port()}",
        world_size=1,
        rank=0)
    assert torch.distributed.is_initialized()
    deepspeed.init_distributed('nccl', auto_mpi_discovery=True)
def test_no_init(self, dist_init_required):
    if dist_init_required or dist_init_required is None:
        deepspeed.init_distributed('nccl', dist_init_required=dist_init_required)
    else:
        # torch.dist is not done and for some reason the user says they don't want it done
        with pytest.raises(Exception):
            deepspeed.init_distributed('nccl',
                                       dist_init_required=dist_init_required)
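The branch above reads like a pytest case covering the three values dist_init_required accepts; a hedged standalone sketch of that parametrization (the test name and marker values are illustrative, not taken from the original suite):

import pytest
import deepspeed

@pytest.mark.parametrize("dist_init_required", [True, False, None])
def test_dist_init_required(dist_init_required):
    if dist_init_required or dist_init_required is None:
        # True / None: DeepSpeed may bring up torch.distributed itself.
        deepspeed.init_distributed('nccl', dist_init_required=dist_init_required)
    else:
        # False with no existing process group is expected to raise.
        with pytest.raises(Exception):
            deepspeed.init_distributed('nccl', dist_init_required=dist_init_required)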
def _init_deepspeed_distributed(self) -> None:
    if platform.system() != "Windows":
        # do not set env variables on windows, allow deepspeed to control setup
        self._set_node_environment_variables()
        log.info(
            "initializing deepspeed distributed: "
            f"GLOBAL_RANK: {self.global_rank}, "
            f"MEMBER: {self.global_rank + 1}/{self.world_size}"
        )
    deepspeed.init_distributed(self.torch_distributed_backend,
                               distributed_port=self.cluster_environment.main_port)
def _setup_devices(self) -> Tuple["torch.device", int]: logger.info("PyTorch: setting up devices") if self.no_cuda: device = torch.device("cpu") n_gpu = 0 elif is_torch_tpu_available(): device = xm.xla_device() n_gpu = 0 elif self.local_rank == -1: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will # trigger an error that a device index is missing. Index 0 takes into account the # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` # will use the first GPU in that env, i.e. GPU#1 device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at # the default value. if self._n_gpu == -1: self._n_gpu = torch.cuda.device_count() n_gpu = self._n_gpu else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of synchronizing nodes/GPUs # # deepspeed performs its own DDP internally, and requires the program to be started with: # deepspeed ./program.py # rather than: # python -m torch.distributed.launch --nproc_per_node=2 ./program.py if self.deepspeed: from .integrations import is_deepspeed_available if not is_deepspeed_available(): raise ImportError( "--deepspeed requires deepspeed: `pip install deepspeed`." ) import deepspeed deepspeed.init_distributed() else: torch.distributed.init_process_group(backend="nccl") device = torch.device("cuda", self.local_rank) n_gpu = 1 if device.type == "cuda": torch.cuda.set_device(device) return device, n_gpu
def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
    """Initialize torch.distributed and execute the user function."""
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29503'
    os.environ['LOCAL_RANK'] = str(local_rank)
    # NOTE: unit tests don't support multi-node so local_rank == global rank
    os.environ['RANK'] = str(local_rank)
    os.environ['WORLD_SIZE'] = str(num_procs)

    deepspeed.init_distributed(dist_backend=backend)

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    run_func(*func_args, **func_kwargs)
def init_ddp_connection(self,
                        global_rank: Optional[int] = None,
                        world_size: Optional[int] = None) -> None:
    if platform.system() != "Windows":
        # do not set env variables on windows, allow deepspeed to control setup
        global_rank = global_rank if global_rank is not None else self.cluster_environment.global_rank()
        world_size = world_size if world_size is not None else self.cluster_environment.world_size()
        self._set_node_environment_variables(global_rank, world_size)
        log.info("initializing deepspeed distributed: "
                 f"GLOBAL_RANK: {global_rank}, "
                 f"MEMBER: {global_rank + 1}/{world_size}")
    deepspeed.init_distributed(
        self.torch_distributed_backend,
        distributed_port=self.cluster_environment.master_port())
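Both Lightning-style hooks above call a _set_node_environment_variables helper before deepspeed.init_distributed; a minimal sketch of the kind of values such a helper exports, assuming the usual MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE rendezvous contract (the exact variable set used by the original plugin may differ):

import os

def set_node_environment_variables(master_addr: str, master_port: int,
                                   global_rank: int, local_rank: int,
                                   world_size: int) -> None:
    # DeepSpeed reads the rendezvous point and ranks from the environment,
    # so they must be exported before deepspeed.init_distributed() runs.
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    os.environ["RANK"] = str(global_rank)
    os.environ["LOCAL_RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)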
def prepare_model_optimizer(self, model):
    # Initialize torch distributed
    deepspeed.init_distributed(dist_backend="nccl")

    # FIXME
    from dataclasses import dataclass

    @dataclass
    class TmpClass:
        local_rank: int

    fake_arg = TmpClass(self.fs_args.device_id)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    self.model, self.optimizer, _, _ = deepspeed.initialize(
        args=fake_arg,
        model=model,
        model_parameters=model.parameters(),
        config_params=self.ds_config,
    )
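The TmpClass dataclass above exists only to hand DeepSpeed an object with a local_rank attribute. A hedged alternative, assuming the installed DeepSpeed accepts config= (the newer spelling of config_params=); argparse.Namespace is used purely as a lighter stand-in:

import argparse
import deepspeed

def build_engine(model, ds_config, device_id):
    # DeepSpeed only needs an args object exposing .local_rank here.
    fake_args = argparse.Namespace(local_rank=device_id)
    engine, optimizer, _, _ = deepspeed.initialize(
        args=fake_args,
        model=model,
        model_parameters=model.parameters(),
        config=ds_config,
    )
    return engine, optimizer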
def setup_process(self, rank: int = -1, world_size: int = 1):
    """Initialize DDP variables and processes.

    Args:
        rank: process rank. Default is `-1`.
        world_size: number of devices in the network to expect for training. Default is `1`.
    """
    self._rank = rank
    self._world_size = world_size
    torch.cuda.set_device(int(self._rank))
    self._device = f"cuda:{int(self._rank)}"

    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = str(self.address)
    os.environ["MASTER_PORT"] = str(self.port)

    deepspeed.init_distributed(**self.process_group_kwargs)
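A hypothetical sketch of how a per-rank setup_process hook like the one above is typically driven; the Runner class and the two-GPU world size are assumptions for illustration:

import torch.multiprocessing as mp

def _worker(rank: int, world_size: int):
    runner = Runner()  # stands in for whatever object defines setup_process()
    runner.setup_process(rank=rank, world_size=world_size)
    # ... per-rank training would follow here ...

if __name__ == "__main__":
    world_size = 2  # assumed two-GPU node
    mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True)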
def prepare_model_optimizer(args): # Initialize torch distributed deepspeed.init_distributed(dist_backend='nccl') args.local_rank = int(os.environ['LOCAL_RANK']) # Loading Model model = BertMultiTask(args) # Optimizer parameters optimizer_grouped_parameters = prepare_optimizer_parameters(args, model) # DeepSpeed initializer handles FP16, distributed, optimizer automatically. model.network, optimizer, _, _ = deepspeed.initialize( args=args, model=model.network, model_parameters=optimizer_grouped_parameters) # Overwrite application configs with DeepSpeed config args.train_micro_batch_size_per_gpu = model.network.train_micro_batch_size_per_gpu( ) args.gradient_accumulation_steps = model.network.gradient_accumulation_steps( ) # Set DeepSpeed info args.local_rank = model.network.local_rank args.device = model.network.device model.set_device(args.device) args.fp16 = model.network.fp16_enabled() args.use_lamb = (model.network.optimizer_name() == deepspeed.runtime.config.LAMB_OPTIMIZER or model.network.optimizer_name() == deepspeed.runtime.config.ONEBIT_LAMB_OPTIMIZER) # Prepare Summary Writer and saved_models path if dist.get_rank() == 0: summary_writer = get_sample_writer(name=args.job_name, base=args.output_dir) args.summary_writer = summary_writer os.makedirs(args.saved_model_path, exist_ok=True) return model, optimizer
def pre_execute_hook(
    cls: Type["DeepSpeedTrialController"],
    env: det.EnvContext,
    distributed_backend: det._DistributedBackend,
) -> None:
    # We use an environment variable to allow users to enable custom initialization routine for
    # distributed training since the pre_execute_hook runs before trial initialization.
    manual_dist_init = os.environ.get("DET_MANUAL_INIT_DISTRIBUTED")
    if not manual_dist_init:
        # DeepSpeed's init_distributed handles situations in which only 1 gpu is used and
        # also handles multiple calls to init in one process.
        deepspeed.init_distributed(auto_mpi_discovery=False)

    # Set identical random seeds on all training processes.
    # When data parallel world size > 1, each data parallel rank will start at a unique
    # offset in the dataset, ensuring it's processing a unique training batch.
    # TODO (Liam): seed data loading workers so that we can configure different seeds for
    # data augmentations per slot per worker.
    random.seed(env.trial_seed)
    np.random.seed(env.trial_seed)
    torch.random.manual_seed(env.trial_seed)
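The DET_MANUAL_INIT_DISTRIBUTED escape hatch above is meant to be set before this hook runs (for example in the experiment's environment variables); the trial then performs its own initialization, roughly as in this sketch (class name and flag handling are illustrative):

import os
import deepspeed

class ManualInitTrial:
    def __init__(self, context):
        # Only sensible when DET_MANUAL_INIT_DISTRIBUTED was exported for the
        # container, so pre_execute_hook skipped deepspeed.init_distributed().
        assert os.environ.get("DET_MANUAL_INIT_DISTRIBUTED")
        deepspeed.init_distributed(auto_mpi_discovery=False)
        self.context = context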
def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
    """Initialize torch.distributed and execute the user function."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = get_master_port()
    os.environ["LOCAL_RANK"] = str(local_rank)
    # NOTE: unit tests don't support multi-node so local_rank == global rank
    os.environ["RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(num_procs)

    # turn off NCCL logging if set
    os.environ.pop("NCCL_DEBUG", None)

    deepspeed.init_distributed(dist_backend=backend)

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    run_func(*func_args, **func_kwargs)

    # make sure all ranks finish at the same time
    torch.distributed.barrier()
    # tear down after test completes
    torch.distributed.destroy_process_group()
def train(local_rank, args): # torch.multiprocessing.set_sharing_strategy('file_system') # too many barriers / one node data parallel and multiple node DDP os.environ['MASTER_ADDR'] = args["master_addr"] os.environ['MASTER_PORT'] = args["master_port"] os.environ["NCCL_DEBUG"] = "WARN" # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank) # gpu_device = 0 gpu_device = local_rank os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if args["wandb_dryrun"]: os.environ["WANDB_MODE"] = "dryrun" os.environ["WANDB_SILENT"] = "true" os.environ['TOKENIZERS_PARALLELISM'] = "true" torch.backends.cudnn.benchmark = True rank = args["nr"] if args["cpu"] else (args["nr"] * args["gpus_per_node"] + local_rank) nr = args["nr"] device = torch.device( f'cuda:{gpu_device}') # Unique only on individual node. torch.cuda.set_device(device) print("[Train]: Time = %s, Prepare to init Dist Process for Rank = %s" % (get_time_string(), rank)) if args["nr"] == 0: args["master_addr"] = "0.0.0.0" init_method = "tcp://%s:%s" % (args["master_addr"], args["master_port"]) deepspeed.init_distributed(distributed_port=int(args["master_port"]), init_method=init_method) format = "%Y-%m-%d %H-%M %Z" # + timedelta(hours=5, minutes=30) time_string = (datetime.fromtimestamp( time.mktime(time.gmtime(rnd.cpu().item())))).astimezone( timezone('Asia/Kolkata')).strftime(format) ds_name = list( filter(lambda x: len(x.strip()) > 0, args["train_dataset"].split("/")))[-1].replace( "train_fastformer_resampled_", "") group = "%s-%s-nodes-%s" % (ds_name, args["nodes"], time_string) set_seeds(args["seed"]) mconf = model_config.to_dict() config = dict(md_config=md_config, sm_config=sm_config)[mconf.pop("model_size")] tokenizer = get_tokenizer(mconf.pop("tokenizer_name")) config.vocab_size = len(tokenizer) + 22 config.tokenizer_length = 1024 config.tokenizer_length = config.tokenizer_length - config.num_highway_cls_tokens config.max_position_embeddings = config.max_position_embeddings + config.num_highway_cls_tokens collate_fn = get_collate_fn(config.num_highway_cls_tokens, tokenizer.pad_token_id) model = FastFormerForFusedELECTRAPretraining(config, tokenizer=tokenizer, **mconf).to(device) print("[Train]: Trainable Params = %s" % (numel(model) / 1_000_000)) if args["pretrained_model"] is not None and os.path.exists( args["pretrained_model"]) and rank == 0: model.load_state_dict( torch.load(args["pretrained_model"], map_location='cuda:%d' % gpu_device)) model_engine, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) model_save_dir = args["model_save_dir"] model_save_name = args["model_save_name"] if local_rank == 0: if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) assert os.path.exists(model_save_dir) shuffle_dataset = args["shuffle_dataset"] train_loader = build_dataloader(args["train_dataset"], shuffle_dataset, sampling_fraction, config, collate_fn, tokenizer, world_size=args["world_size"], num_workers=args["num_workers"], no_autocast=args["no_autocast"]) print("[Train]: Data Loaded for Rank = %s" % rank) log_every_steps = args["log_every_steps"] save_every_steps = args["save_every_steps"] print("[Train]: Scheduler Created for Rank = %s" % rank) other_load_details = None if "resume" in args and isinstance( args["resume"], str) and len(args["resume"].strip()) > 0: _, other_load_details = model_engine.load_checkpoint( model_save_dir, args["resume"]) step = other_load_details['step'] else: print("[Train]: No Resume for Rank = %s" % rank) _ = model.train() # print("[Train]: Init 
Wandb-watch added over model for Rank = %s" % rank) # wandb.watch(model, log="all", log_freq=log_every_steps) print("[Train]: WandB-watch added over model for Rank = %s" % rank) batch_times = [] model_times = [] full_times = [] samples_processed = 0 samples_processed_this_log_iter = 0 print("[Train]: Time = %s, Start Training for Rank = %s" % (get_time_string(), rank)) if local_rank == 0: wandb_init_args = dict(project="fastformer", name="%s-%s-%s-%s" % (group, args["nr"], rank, local_rank), group=group, id=f"{group}-worker-{nr}-{rank}-{local_rank}", config={ "args": args, "model_config": mconf, "config": config, "optimizer_config": optc }, settings=wandb.Settings(start_method="fork")) time.sleep(random.random() * 5) wandb.init(**wandb_init_args) if args["detect_anomaly"]: torch.autograd.set_detect_anomaly(True) def get_hook(name_of_param=None): if name_of_param is None: def hook(grad): is_nan_inf = torch.logical_not(torch.isfinite(grad)) if is_nan_inf.any(): grad = torch.where( is_nan_inf, torch.sign(grad) * torch.empty_like(grad).fill_(1e-3), grad) # grad = torch.clamp_(grad, -1e1, 1e1) return grad else: return None return hook else: def named_hook(grad): is_nan_inf = torch.logical_not(torch.isfinite(grad)) if is_nan_inf.any(): print( "[GRAD-HOOK]: Time = %s, Param Name = %s, Detected Nan/Inf" % (get_time_string(), name_of_param)) grad = torch.where( is_nan_inf, torch.sign(grad) * torch.empty_like(grad).fill_(1e-3), grad) # grad = torch.clamp_(grad, -1e1, 1e1) return grad else: return None return named_hook if args["backward_hook"]: for name, param in model.named_parameters(): if "embeddings" in name or "sent_predict_fc" in name or "embed_proj_transpose" in name or "embed_proj" in name or "lm_head" in name or "contrastive_ffn" in name or "encoder.blocks.0" in name: # param.register_hook(get_hook()) else: param.register_hook(get_hook()) start_time = time.time() for step, batch in enumerate(train_loader): gen_batch_time = time.time() - start_time batch_times.append(gen_batch_time) bs_size = list(batch["input_ids"].size()) batch = { k: v.to(device, non_blocking=True) if hasattr(v, "to") else v for k, v in batch.items() } electra_loss_w = float( ((step + 1) / (2 * 10000)) * mconf["electra_loss_w"]) model_engine.model.electra_loss_w = electra_loss_w if (step + 1) % save_every_steps == 0: client_sd = dict() client_sd['step'] = step ckpt_id = step model_engine.save_checkpoint(model_save_dir, ckpt_id, client_sd=client_sd) record_accuracy = False if (step + 1) % log_every_steps == 0: if local_rank == 0: record_accuracy = True batch["record_accuracy"] = record_accuracy labels = batch[ "label_mlm_input_ids"] if "label_mlm_input_ids" in batch else batch[ "input_ids"] labels = labels.to(device, non_blocking=True) model_start_time = time.time() samples_processed += int(batch["input_ids"].size(0)) samples_processed_this_log_iter += int(batch["input_ids"].size(0)) # clean_memory() # print("Step = %s, Before:, for Rank = %s, input_size = %s, Allocated = %.3f, Max Allocated = %.3f, Percent = %s" % # (step, rank, batch["input_ids"].size(), torch.cuda.memory_allocated() / 1e6, torch.cuda.max_memory_allocated() /1e6, torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated())) # torch.cuda.memory_summary() # forward() method output = model_engine(**batch, labels=labels) loss = output["loss"] # runs backpropagation model_engine.backward(loss) # weight update model_engine.step() # clean_memory() # print("Step = %s, After: , for Rank = %s, input_size = %s, Allocated = %.3f, Max Allocated = %.3f, Percent 
= %s" % # (step, rank, batch["input_ids"].size(), torch.cuda.memory_allocated() / 1e6, torch.cuda.max_memory_allocated() / 1e6, # torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated())) # torch.cuda.memory_summary() model_end_time = time.time() - model_start_time model_times.append(model_end_time) full_time = time.time() - start_time full_times.append(full_time) if step == 0: print("[Train]: Time = %s, First Batch Training for Rank = %s" % (get_time_string(), rank)) if (step + 1) % log_every_steps == 0: if local_rank == 0: samples_per_second = samples_processed_this_log_iter / np.sum( full_times) acc_dict = output["accuracy_hist"] loss_dict = output["loss_dict"] time.sleep(random.random() + 0.1) wandb.log( dict(lr=optimizer.param_groups[0]['lr'], step=step, samples_processed=samples_processed, samples_per_second=samples_per_second, batch_x_sequence=np.prod(bs_size[:2]), batch_times=np.mean(batch_times), model_times=np.mean(model_times), full_times=np.mean(full_times), scale=scaler.get_scale(), **loss_dict, **acc_dict)) print( "[Train]: Time = %s, Rank = %s, steps = %s, samples_processed=%s, batch_size = %s, Loss = %s, Accuracy = %s, LR = %s" % (get_time_string(), rank, step, samples_processed, bs_size, loss_dict, output["accuracy_hist"], optimizer.param_groups[0]['lr'])) print( "[Train-Timings]: Time = %s, Batch time = %.4f, Model Time = %.4f, Full time = %.4f, samples_per_second = %s" % (get_time_string(), np.mean(batch_times), np.mean(model_times), np.mean(full_times), samples_per_second)) del acc_dict del loss_dict batch_times = [] model_times = [] full_times = [] samples_processed_this_log_iter = 0 del batch del labels del output del bs_size start_time = time.time()
def worker(proc_id, gpu_ranks, args, model): """ Args: proc_id: The id of GPU for single GPU mode; The id of process (and GPU) for multiprocessing distributed mode. gpu_ranks: List of ranks of each process. """ set_seed(args.seed) # Get logger args.logger = init_logger(args) if args.deepspeed: import deepspeed deepspeed.init_distributed(dist_backend=args.backend) rank = dist.get_rank() gpu_id = proc_id elif args.dist_train: rank = gpu_ranks[proc_id] gpu_id = proc_id elif args.single_gpu: rank = None gpu_id = proc_id else: rank = None gpu_id = None if args.dist_train: train_loader = str2dataloader[args.data_processor]( args, args.dataset_path, args.batch_size, rank, args.world_size, True) else: train_loader = str2dataloader[args.data_processor](args, args.dataset_path, args.batch_size, 0, 1, True) # Build optimizer. param_optimizer = list(model.named_parameters()) no_decay = ["bias", "gamma", "beta"] optimizer_grouped_parameters = [{ "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01 }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] if args.optimizer in ["adamw"]: custom_optimizer = str2optimizer[args.optimizer]( optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) else: custom_optimizer = str2optimizer[args.optimizer]( optimizer_grouped_parameters, lr=args.learning_rate, scale_parameter=False, relative_step=False) if args.scheduler in ["constant"]: custom_scheduler = str2scheduler[args.scheduler](custom_optimizer) elif args.scheduler in ["constant_with_warmup"]: custom_scheduler = str2scheduler[args.scheduler]( custom_optimizer, args.total_steps * args.warmup) else: custom_scheduler = str2scheduler[args.scheduler]( custom_optimizer, args.total_steps * args.warmup, args.total_steps) if args.deepspeed: model, optimizer, _, scheduler = deepspeed.initialize( model=model, model_parameters=optimizer_grouped_parameters, args=args, optimizer=custom_optimizer, lr_scheduler=custom_scheduler, mpu=None, dist_init_required=False) else: if gpu_id is not None: model.cuda(gpu_id) optimizer = custom_optimizer scheduler = custom_scheduler if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if args.dist_train: # Initialize multiprocessing distributed training environment. dist.init_process_group(backend=args.backend, init_method=args.master_ip, world_size=args.world_size, rank=rank) model = DistributedDataParallel(model, device_ids=[gpu_id], find_unused_parameters=True) args.logger.info("Worker %d is training ... " % rank) else: args.logger.info("Worker is training ...") trainer = str2trainer[args.data_processor](args) trainer.train(args, gpu_id, rank, train_loader, model, optimizer, scheduler)
def test_already_init(self, dist_init_required):
    torch.distributed.init_process_group('nccl')
    deepspeed.init_distributed('nccl', dist_init_required=dist_init_required)
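A hedged, single-process sketch of the property this test relies on: when a torch.distributed process group already exists, deepspeed.init_distributed() is expected to attach to it rather than create a second one (the gloo backend and fixed port are assumptions so the sketch can run on CPU):

import os
import torch.distributed as torch_dist
import deepspeed

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

# Bring up torch.distributed first, as the test does (but with gloo on CPU).
torch_dist.init_process_group(backend="gloo", world_size=1, rank=0)
assert torch_dist.is_initialized()

# DeepSpeed should detect the existing group instead of re-initializing it.
deepspeed.init_distributed("gloo", dist_init_required=True)
assert torch_dist.is_initialized()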
import gpt_neox WORLD_SIZE = os.getenv('WORLD_SIZE') # arguments train_args = get_args() params = get_params(train_args.model) # tokenizer tokenizer = get_tokenizer(tokenizer_type=params["tokenizer"].get("type", None), from_pretrained=params["tokenizer"].get("from_pretrained", True), add_padding_token=params["tokenizer"].get("add_padding_token", False)) vocab_size = len(tokenizer) if params["vocab_size"] is None else params["vocab_size"] # model deepspeed.init_distributed(dist_backend='nccl') torch.distributed.barrier() # barrier will force processes to stop until *all* processes have reached the barrier def loss_function(x, y): losses = torch.nn.functional.cross_entropy(x, y, reduction='none') loss = losses.mean() return loss model = GPTNeoX_Pipe( num_tokens=params["vocab_size"], dim=params["hidden_dim"], seq_len=params["seq_len"], depth=params["n_layers"], heads=params["n_heads"], dim_head=params["dim_head"], loss_fn = loss_function,#torch.nn.CrossEntropyLoss(),
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.") tokenizer_opts(parser) parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.") parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.") deepspeed_opts(parser) parser.add_argument("--mp_size", type=int, default=1, help="Model parallel size.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model and load parameters. args.soft_targets, args.soft_alpha = False, False deepspeed.init_distributed() model = Classifier(args) if args.load_model_path: model = load_model(model, args.load_model_path) model = deepspeed.init_inference(model=model, mp_size=args.mp_size, replace_method=None) rank = dist.get_rank() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if rank == 0: dataset = read_dataset(args, args.test_path) src = torch.LongTensor([sample[0] for sample in dataset]) seg = torch.LongTensor([sample[1] for sample in dataset]) batch_size = args.batch_size instances_num = src.size()[0] print("The number of prediction instances: ", instances_num) model.eval() with open(args.prediction_path, mode="w", encoding="utf-8") as f: f.write("label") if args.output_logits: f.write("\t" + "logits") if args.output_prob: f.write("\t" + "prob") f.write("\n") for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) pred = torch.argmax(logits, dim=1) pred = pred.cpu().numpy().tolist() prob = nn.Softmax(dim=1)(logits) logits = logits.cpu().numpy().tolist() prob = prob.cpu().numpy().tolist() for j in range(len(pred)): f.write(str(pred[j])) if args.output_logits: f.write("\t" + " ".join([str(v) for v in logits[j]])) if args.output_prob: f.write("\t" + " ".join([str(v) for v in prob[j]])) f.write("\n")
def main(): parser = get_argument_parser() deepspeed.init_distributed(dist_backend='nccl') # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() args.local_rank = int(os.environ['LOCAL_RANK']) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForQuestionAnswering.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) # Support for word embedding padding checkpoints # Prepare model bert_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 1024, "num_hidden_layers": 24, "num_attention_heads": 16, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": args.dropout, "attention_probs_dropout_prob": args.dropout, "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.ckpt_type == "DS": if args.preln: bert_config = BertConfigPreLN(**bert_model_config) else: bert_config = BertConfig(**bert_model_config) else: # Models from Tensorflow and Huggingface are post-LN. if args.preln: raise ValueError( "Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm." ) # Use the original bert config if want to load from non-DeepSpeed checkpoint. if args.origin_bert_config_file is None: raise ValueError( "--origin_bert_config_file is required for loading non-DeepSpeed checkpoint." 
) bert_config = BertConfig.from_json_file(args.origin_bert_config_file) if bert_config.vocab_size != len(tokenizer.vocab): raise ValueError("vocab size from original checkpoint mismatch.") bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: vocab_diff = 8 - (bert_config.vocab_size % 8) bert_config.vocab_size += vocab_diff if args.preln: model = BertForQuestionAnsweringPreLN(bert_config, args) else: model = BertForQuestionAnswering(bert_config, args) print("VOCAB SIZE:", bert_config.vocab_size) if args.model_file is not "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") if args.ckpt_type == "DS": checkpoint_state_dict = torch.load( args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict( checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") else: from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file, vocab_diff, args.deepspeed_transformer_kernel) logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] if args.deepspeed_transformer_kernel: no_decay = no_decay + [ 'attn_nw', 'attn_nb', 'norm_w', 'norm_b', 'attn_qkvb', 'attn_ob', 'inter_b', 'output_b' ] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=optimizer_grouped_parameters, dist_init_required=True) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs #torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: os.makedirs(args.output_dir, exist_ok=True) # Prepare Summary writer if torch.distributed.get_rank() == 0 and args.job_name is not None: args.summary_writer = get_summary_writer(name=args.job_name, base=args.output_dir) else: args.summary_writer = None logger.info("propagate deepspeed-config settings to client settings") args.train_batch_size = model.train_micro_batch_size_per_gpu() args.gradient_accumulation_steps = model.gradient_accumulation_steps() args.fp16 = model.fp16_enabled() args.print_steps = model.steps_per_print() args.learning_rate = model.get_lr()[0] args.wall_clock_breakdown = model.wall_clock_breakdown() t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() ema_loss = 0. sample_count = 0 num_epoch = 0 global all_step_time ave_rounds = 20 for _ in trange(int(args.num_train_epochs), desc="Epoch"): num_epoch += 1 epoch_step = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", smoothing=0)): start_time = time.time() bs_size = batch[0].size()[0] if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps ema_loss = args.loss_plot_alpha * ema_loss + ( 1 - args.loss_plot_alpha) * loss.item() model.backward(loss) loss_item = loss.item() * args.gradient_accumulation_steps loss = None sample_count += (args.train_batch_size * torch.distributed.get_world_size()) if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step model.step() global_step += 1 epoch_step += 1 if torch.distributed.get_rank( ) == 0 and args.summary_writer: summary_events = [ (f'Train/Steps/lr', lr_this_step, global_step), (f'Train/Samples/train_loss', loss_item, sample_count), (f'Train/Samples/lr', lr_this_step, sample_count), (f'Train/Samples/train_ema_loss', ema_loss, sample_count) ] if args.fp16 and hasattr(optimizer, 'cur_scale'): summary_events.append( (f'Train/Samples/scale', optimizer.cur_scale, sample_count)) write_summary_events(args.summary_writer, summary_events) args.summary_writer.flush() if torch.distributed.get_rank() == 0 and ( step + 1) % args.print_steps == 0: logger.info( f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}" ) else: model.step() if is_time_to_exit(args=args, epoch_steps=epoch_step, global_steps=global_step): logger.info( f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}' ) break one_step_time = time.time() - start_time all_step_time += one_step_time if (step + 1) % ( ave_rounds) == 0 and torch.distributed.get_rank() == 0: print( ' At step {}, averaged throughput for {} rounds is: {} Samples/s' .format( step, ave_rounds, bs_size * ave_rounds * torch.distributed.get_world_size() / all_step_time), flush=True) all_step_time = 0.0 # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned #model_state_dict = torch.load(output_model_file) #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) # model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = 
SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
if args.precision == 'bf16': af2features.dtype = torch.bfloat16 elif args.precision == 'fp16': af2features.dtype = torch.float16 af2 = AlphaFold(config=config.model, target_dim=22, msa_dim=49, extra_msa_dim=25).to(device='cuda') os.environ['RANK'] = '0' os.environ['LOCAL_RANK'] = '0' os.environ['WORLD_SIZE'] = '1' os.environ['MASTER_ADDR'] = '127.0.0.1' os.environ['MASTER_PORT'] = '6000' deepspeed.init_distributed(auto_mpi_discovery=False) af2, optimizer, _, _ = deepspeed.initialize( model=af2, model_parameters=af2.parameters(), config=args.deepspeed_config_path, dist_init_required=True) with open(args.dataset_dir / args.sample_name, 'rb') as f: raw_features = pickle.load(f) batch = af2features(raw_features) if args.precision == 'fp16': batch = af2features.convert(batch, dtypes={ torch.float32: torch.float16, torch.float64: torch.float32
def train(args): num_epochs = args.epochs local_rank = args.local_rank if local_rank == -1: local_rank = int(os.environ.get('PMIX_RANK', -1)) deepspeed.init_distributed(timeout=timedelta(minutes=5)) world_size = int(os.environ['WORLD_SIZE']) torch.manual_seed(0) # Set up standard model. if local_rank == 0: print('Using {} model'.format(args.model)) model = getattr(models, args.model)() model = model.cuda() criterion = nn.CrossEntropyLoss().cuda() train_dataset = dataset_from_datadir(args.datadir) model_engine, optimizer, train_loader, __ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters(), training_data=train_dataset) # For final average avg_images = 0 avg_start = None tot_steps = 0 for epoch in range(num_epochs): for i, data in enumerate(train_loader): images = data[0].to(model_engine.local_rank) labels = data[1].to(model_engine.local_rank) outputs = model_engine(images) loss = criterion(outputs, labels) model_engine.backward(loss) model_engine.step() li = len(images) # last_images += li tot_steps += 1 if tot_steps == args.warmup_steps: avg_start = datetime.now() elif tot_steps > args.warmup_steps: avg_images += li if args.steps is not None and tot_steps >= args.steps: break if local_rank == 0: if avg_start is None: print( "WARNING: stopped before warmup steps done, not printing stats." ) else: dur = datetime.now() - avg_start print(f"Training completed in: {dur}") print( f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} " f"(average, skipping {args.warmup_steps} warmup steps)")
net = AlexNet(num_classes=10) net = PipelineModule(layers=join_layers(net), loss_fn=torch.nn.CrossEntropyLoss(), num_stages=args.pipeline_parallel_size, partition_method=part, activation_checkpoint_interval=0) trainset = cifar_trainset(args.local_rank) engine, _, _, _ = deepspeed.initialize( args=args, model=net, model_parameters=[p for p in net.parameters() if p.requires_grad], training_data=trainset) for step in range(args.steps): loss = engine.train_batch() if __name__ == '__main__': args = get_args() deepspeed.init_distributed(dist_backend=args.backend) args.local_rank = int(os.environ['LOCAL_RANK']) torch.cuda.set_device(args.local_rank) if args.pipeline_parallel_size == 0: train_base(args) else: train_pipe(args)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.") tokenizer_opts(parser) parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--soft_alpha", type=float, default=0.5, help="Weight of the soft targets loss.") deepspeed_opts(parser) args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model. model = Classifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) # Get logger. args.logger = init_logger(args) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "gamma", "beta"] optimizer_grouped_parameters = [{ "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01 }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] deepspeed.init_distributed() rank = dist.get_rank() args.rank = rank trainset = read_dataset(args, args.train_path, split=True)[args.rank] random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 custom_optimizer, custom_scheduler = build_optimizer(args, model) model, optimizer, _, scheduler = deepspeed.initialize( model=model, model_parameters=optimizer_grouped_parameters, args=args, optimizer=custom_optimizer, lr_scheduler=custom_scheduler, mpu=None, dist_init_required=False) src = torch.LongTensor([example[0] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg = torch.LongTensor([example[2] for example in trainset]) if args.soft_targets: soft_tgt = torch.FloatTensor([example[3] for example in trainset]) else: soft_tgt = None args.model = model args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") total_loss, result, best_result, best_epoch = 0.0, 0.0, 0.0, 0 result_tensor = torch.tensor(result).to(args.device) if args.rank == 0: args.logger.info("Batch size: {}".format(batch_size)) args.logger.info( "The number of training instances: {}".format(instances_num)) args.logger.info("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate( batch_loader(batch_size, src, tgt, seg, soft_tgt)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0 and args.rank == 0: args.logger.info( "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 if args.rank == 0: result = evaluate(args, read_dataset(args, args.dev_path, split=False)) result_tensor = torch.tensor(result[0]).to(args.device) dist.broadcast(result_tensor, 0, async_op=False) if result_tensor.float() >= best_result: best_result = result_tensor.float().item() best_epoch = epoch model.save_checkpoint(args.output_model_path, str(epoch)) # Evaluation phase. 
if args.test_path is not None and args.rank == 0: args.logger.info("Test set evaluation.") model.load_checkpoint(args.output_model_path, str(best_epoch)) evaluate(args, read_dataset(args, args.test_path, split=False), True)
parser.add_argument("--mp_size", type=int, default=1, help="Model parallel size.") args = parser.parse_args() args.batch_size = 1 args = load_hyperparam(args) args.tokenizer = str2tokenizer[args.tokenizer](args) model = GenerateLm(args) model = load_model(model, args.load_model_path) deepspeed.init_distributed() model = deepspeed.init_inference(model=model, mp_size=args.mp_size, replace_method=None) rank = dist.get_rank() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if rank == 0: model.eval() with open(args.test_path, mode="r", encoding="utf-8") as f: line = f.readline().strip() src = args.tokenizer.convert_tokens_to_ids( [CLS_TOKEN] + args.tokenizer.tokenize(line)) seg = [1] * len(src) beginning_length = len(src)
def main_worker(save_dir, args): # basic setup cudnn.benchmark = True if args.log_name is not None: log_dir = "runs/%s" % args.log_name else: log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}" if args.local_rank == 0: logger = SummaryWriter(log_dir) else: logger = None deepspeed.init_distributed(dist_backend='nccl') torch.cuda.set_device(args.local_rank) model = SetVAE(args) parameters = model.parameters() n_parameters = sum(p.numel() for p in parameters if p.requires_grad) print(f'number of params: {n_parameters}') try: n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \ sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \ sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \ sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \ sum(p.numel() for p in model.output.parameters() if p.requires_grad) print(f'number of generator params: {n_gen_parameters}') except AttributeError: pass optimizer, criterion = model.make_optimizer(args) # initialize datasets and loaders train_dataset, val_dataset, train_loader, val_loader = get_datasets(args) # initialize the learning rate scheduler if args.scheduler == 'exponential': assert not (args.warmup_epochs > 0) scheduler = torch.optim.lr_scheduler.ExponentialLR( optimizer, args.exp_decay) elif args.scheduler == 'step': assert not (args.warmup_epochs > 0) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.epochs // 2, gamma=0.1) elif args.scheduler == 'linear': def lambda_rule(ep): lr_w = min(1., ep / args.warmup_epochs) if (args.warmup_epochs > 0) else 1. lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float( 0.5 * args.epochs) return lr_l * lr_w scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) elif args.scheduler == 'cosine': assert not (args.warmup_epochs > 0) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=args.epochs) else: # Fake SCHEDULER def lambda_rule(ep): return 1.0 scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) # extract collate_fn if args.distributed: collate_fn = deepcopy(train_loader.collate_fn) model, optimizer, train_loader, scheduler = deepspeed.initialize( args=args, model=model, optimizer=optimizer, model_parameters=parameters, training_data=train_dataset, collate_fn=collate_fn, lr_scheduler=scheduler) # resume checkpoints start_epoch = 0 if args.resume_checkpoint is None and Path( Path(save_dir) / 'checkpoint-latest.pt').exists(): args.resume_checkpoint = os.path.join( save_dir, 'checkpoint-latest.pt') # use the latest checkpoint print('Resumed from: ' + args.resume_checkpoint) if args.resume_checkpoint is not None: if args.distributed: if args.resume_optimizer: model.module, model.optimizer, model.lr_scheduler, start_epoch = resume( args.resume_checkpoint, model.module, model.optimizer, scheduler=model.lr_scheduler, strict=(not args.resume_non_strict)) else: model.module, _, _, start_epoch = resume( args.resume_checkpoint, model.module, optimizer=None, strict=(not args.resume_non_strict)) else: if args.resume_optimizer: model, optimizer, scheduler, start_epoch = resume( args.resume_checkpoint, model, optimizer, scheduler=scheduler, strict=(not args.resume_non_strict)) else: model, _, _, start_epoch = resume( args.resume_checkpoint, model, optimizer=None, strict=(not args.resume_non_strict)) # save dataset statistics if args.local_rank == 0: train_dataset.save_statistics(save_dir) 
val_dataset.save_statistics(save_dir) # main training loop avg_meters = { 'kl_avg_meter': AverageValueMeter(), 'l2_avg_meter': AverageValueMeter() } assert args.distributed epoch = start_epoch print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs)) for epoch in range(start_epoch, args.epochs): if args.local_rank == 0: # evaluate on the validation set if epoch % args.val_freq == 0 and epoch != 0: model.eval() with torch.no_grad(): val_res = validate(model.module, args, val_loader, epoch, logger, save_dir) for k, v in val_res.items(): v = v.cpu().detach().item() send_slack(f'{k}:{v}, Epoch {epoch - 1}') if logger is not None and v is not None: logger.add_scalar(f'val_sample/{k}', v, epoch - 1) # train for one epoch train_one_epoch(epoch, model, criterion, optimizer, args, train_loader, avg_meters, logger) # Only on HEAD process if args.local_rank == 0: # save checkpoints if (epoch + 1) % args.save_freq == 0: if args.eval: validate_reconstruct_l2(epoch, val_loader, model, criterion, args, logger) save(model.module, model.optimizer, model.lr_scheduler, epoch + 1, Path(save_dir) / f'checkpoint-{epoch}.pt') save(model.module, model.optimizer, model.lr_scheduler, epoch + 1, Path(save_dir) / 'checkpoint-latest.pt') # save visualizations if (epoch + 1) % args.viz_freq == 0: with torch.no_grad(): visualize(model.module, args, val_loader, epoch, logger) # adjust the learning rate model.lr_scheduler.step() if logger is not None and args.local_rank == 0: logger.add_scalar('train lr', model.lr_scheduler.get_last_lr()[0], epoch) model.eval() if args.local_rank == 0: with torch.no_grad(): val_res = validate(model.module, args, val_loader, epoch, logger, save_dir) for k, v in val_res.items(): v = v.cpu().detach().item() send_slack(f'{k}:{v}, Epoch {epoch}') if logger is not None and v is not None: logger.add_scalar(f'val_sample/{k}', v, epoch) if logger is not None: logger.flush() logger.close()