def test_cpu_adam_opt(model_size):
    # Compare DeepSpeedCPUAdam against torch.optim.AdamW (CPU) and FusedAdam
    # (GPU) on identically initialized parameters and identical random grads.
    from deepspeed.ops.adam import DeepSpeedCPUAdam
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param2_data = torch.randn(model_size, device=device).cuda()
    param2 = torch.nn.Parameter(param2_data)

    optimizer1 = torch.optim.AdamW([param1])
    optimizer2 = FusedAdam([param2])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param2.grad = torch.randn(model_size, device=device).cuda()
        optimizer.step()
        optimizer2.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)
    check_equal(param, param2.cpu(), atol=1e-2, verbose=True)
def test_cpu_adam_gpu_error():
    # DeepSpeedCPUAdam expects CPU-resident parameters; stepping over a CUDA
    # parameter should trip an assertion.
    model_size = 64
    from deepspeed.ops.adam import DeepSpeedCPUAdam
    device = 'cuda:0'
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    optimizer = DeepSpeedCPUAdam([param])

    param.grad = torch.randn(model_size, device=device)
    with pytest.raises(AssertionError):
        optimizer.step()
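# The tests above receive `model_size` as a pytest argument, so the surrounding
# test module is expected to parametrize it. A minimal sketch of that wiring;
# the function name and the concrete size list below are illustrative
# assumptions, not taken from the source.
import pytest

@pytest.mark.parametrize('model_size', [8, 64, 1024, 1048576])
def test_cpu_adam_opt_parametrized(model_size):
    # body as in test_cpu_adam_opt above
    ...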
def configure_optimizers(self):
    no_decay = ["bias", "LayerNorm.weight"]
    params_decay = [
        p for n, p in self.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    params_nodecay = [
        p for n, p in self.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    optim_groups = [
        {
            "params": params_decay,
            "weight_decay": self.hparams.weight_decay
        },
        {
            "params": params_nodecay,
            "weight_decay": 0.0
        },
    ]
    # todo: need to enable deepspeed cpu adam only if offloading
    if self.deepspeed_offload:
        return DeepSpeedCPUAdam(optim_groups,
                                lr=self.hparams.learning_rate,
                                betas=self.hparams.betas)
    return FusedAdam(optim_groups,
                     lr=self.hparams.learning_rate,
                     betas=self.hparams.betas)
def test_cpu_adam_opt(model_size):
    from deepspeed.ops.adam import DeepSpeedCPUAdam  # import added for completeness
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(torch.randn(model_size, device=device))
    torch.set_rng_state(rng_state)
    param1 = torch.nn.Parameter(torch.randn(model_size, device=device))

    optimizer1 = torch.optim.Adam([param1])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        optimizer.step()
        optimizer1.step()

    check_equal(param, param1, atol=1e-2, verbose=True)
def test_cpu_adam_opt(dtype, model_size):
    if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
        pytest.skip("cpu-adam with half precision not supported on AMD CPUs")
    from deepspeed.ops.adam import DeepSpeedCPUAdam
    device = 'cpu'
    rng_state = torch.get_rng_state()
    param = torch.nn.Parameter(
        torch.randn(model_size, device=device).to(dtype))
    torch.set_rng_state(rng_state)
    param1_data = torch.randn(model_size, device=device)
    param1 = torch.nn.Parameter(param1_data)
    torch.set_rng_state(rng_state)
    param2_data = torch.randn(model_size, device=device).to(dtype).cuda()
    param2 = torch.nn.Parameter(param2_data)

    optimizer1 = torch.optim.AdamW([param1])
    optimizer2 = FusedAdam([param2])
    optimizer = DeepSpeedCPUAdam([param])

    for i in range(10):
        rng_state = torch.get_rng_state()
        param.grad = torch.randn(model_size, device=device).to(dtype)
        torch.set_rng_state(rng_state)
        param1.grad = torch.randn(model_size, device=device)
        torch.set_rng_state(rng_state)
        param2.grad = torch.randn(model_size, device=device).to(dtype).cuda()
        optimizer.step()
        optimizer2.step()
        optimizer1.step()

    tolerance = param1.float().norm().detach().numpy() * 1e-2
    check_equal(param.float().norm(),
                param1.float().norm(),
                atol=tolerance,
                verbose=True)
    check_equal(param.float().norm(),
                param2.float().cpu().norm(),
                atol=tolerance,
                verbose=True)
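# `pytest.cpu_vendor` used above is not a pytest built-in, so the test suite
# has to attach it in a conftest.py hook. A hedged sketch of how that attribute
# could be set; detecting the vendor via py-cpuinfo (and the "vendor_id_raw"
# key) is an assumption, not taken from the source.
import pytest
from cpuinfo import get_cpu_info  # pip install py-cpuinfo

def pytest_configure(config):
    # expose the CPU vendor string so tests can skip half precision on AMD
    pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower()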
def get_model_tokenizer_optimizer(args):
    model, tokenizer, _ = build_model(args)
    model.half()
    model.cuda(args.local_rank)
    # XXX: all changes to model parameters
    # (e.g. add_special_tokens)
    # must happen before DDP !!
    model = DDP(model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)
    model_obj = model.module
    if args.freeze_body:
        model_obj.transformer.requires_grad_(False)
        model_obj.transformer.wpe.requires_grad_(True)
        model_obj.transformer.emb_norm.requires_grad_(True)
        model_obj.lm_head.requires_grad_(True)
        params = [
            dict(params=v) for v in [
                # wte is tied with lm_head, no need to run requires_grad_
                # don't put wte in the optimizer: params can't be duplicated,
                # and autodiff would compute grads twice for params in lm_head
                # model.module.transformer.wte.parameters(),
                model_obj.transformer.wpe.parameters(),
                model_obj.transformer.emb_norm.parameters(),
                model_obj.lm_head.parameters()
            ]
        ]
    else:
        model.requires_grad_(True)
        params = model_obj.parameters()
    optimizer = DeepSpeedCPUAdam(params, lr=args.lr, weight_decay=0.01)
    return model, tokenizer, optimizer
def main(arch="bert-base-uncased", config="gpu.json"): # Reference: # # * https://github.com/huggingface/nlp/blob/master/notebooks/Overview.ipynb with open(config) as fin: config_params = json.load(fin) dataset = nlp.load_dataset('glue', "sst2") print(set([x['label'] for x in dataset["train"]])) tokenizer = BertTokenizerFast.from_pretrained(arch) # Format our dataset to outputs torch.Tensor to train a pytorch model columns = ['input_ids', 'token_type_ids', 'attention_mask', "label"] for subset in ("train", "validation"): dataset[subset] = dataset[subset].map(partial(convert_to_features, tokenizer), batched=True) dataset[subset].set_format(type='torch', columns=columns) print(tokenizer.decode(dataset['train'][6]["input_ids"].numpy())) print(dataset['train'][0]["attention_mask"]) valid_idx, test_idx = train_test_split(list( range(len(dataset["validation"]))), test_size=0.5, random_state=42) train_dict = { "input_ids": dataset['train']["input_ids"], "attention_mask": dataset['train']["attention_mask"], "token_type_ids": dataset['train']["token_type_ids"], "label": dataset['train']["label"] } valid_dict = { "input_ids": dataset['validation']["input_ids"][valid_idx], "attention_mask": dataset['validation']["attention_mask"][valid_idx], "token_type_ids": dataset['validation']["token_type_ids"][valid_idx], "label": dataset['validation']["label"][valid_idx] } test_dict = { "input_ids": dataset['validation']["input_ids"][test_idx], "attention_mask": dataset['validation']["attention_mask"][test_idx], "token_type_ids": dataset['validation']["token_type_ids"][test_idx], "label": dataset['validation']["label"][test_idx] } # Instantiate a PyTorch Dataloader around our dataset train_loader = torch.utils.data.DataLoader( SST2Dataset(train_dict), batch_size=config_params["train_batch_size"], shuffle=True) valid_loader = torch.utils.data.DataLoader( SST2Dataset(valid_dict), batch_size=config_params["train_batch_size"], drop_last=False) test_loader = torch.utils.data.DataLoader( SST2Dataset(test_dict), batch_size=config_params["train_batch_size"], drop_last=False) model = BertForSequenceClassification.from_pretrained(arch) # torch.nn.init.kaiming_normal_(model.classifier.weight) # torch.nn.init.constant_(model.classifier.bias, 0) # torch.nn.init.kaiming_normal_(model.bert.pooler.dense.weight) # torch.nn.init.constant_(model.bert.pooler.dense.bias, 0); args = Object() setattr(args, "local_rank", 0) setattr(args, "deepspeed_config", config) if config[:3] == "cpu": if "optimizer" in config_params: model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) else: from deepspeed.ops.adam import DeepSpeedCPUAdam optimizer = DeepSpeedCPUAdam(model.parameters(), lr=2e-5) model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters(), optimizer=optimizer) else: model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters() # optimizer=optimizer ) total_steps = len(train_loader) * 3 # checkpoints = CheckpointCallback( # keep_n_checkpoints=1, # checkpoint_dir=CACHE_DIR / "model_cache/", # monitor_metric="accuracy" # ) lr_durations = [int(total_steps * 0.2), int(np.ceil(total_steps * 0.8))] break_points = [0] + list(np.cumsum(lr_durations))[:-1] callbacks = [ MovingAverageStatsTrackerCallback(avg_window=len(train_loader) // 8, log_interval=len(train_loader) // 10), LearningRateSchedulerCallback( MultiStageScheduler([ LinearLR(optimizer, 0.01, lr_durations[0]), 
CosineAnnealingScheduler(optimizer, lr_durations[1]) ], start_at_epochs=break_points)), # checkpoints ] bot = SST2Bot( model=model, train_loader=train_loader, valid_loader=valid_loader, clip_grad=10., optimizer=optimizer, echo=True, criterion=torch.nn.CrossEntropyLoss(), callbacks=callbacks, pbar=False, use_tensorboard=False, # use_amp=APEX_AVAILABLE, metrics=(Top1Accuracy(), )) print(total_steps) bot.train(total_steps=total_steps, checkpoint_interval=len(train_loader) // 2) # bot.load_model(checkpoints.best_performers[0][1]) # checkpoints.remove_checkpoints(keep=0) # TARGET_DIR = CACHE_DIR / "sst2_bert_uncased" # TARGET_DIR.mkdir(exist_ok=True) # bot.model.save_pretrained(TARGET_DIR) bot.eval(valid_loader) bot.eval(test_loader)
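# The `config[:3] == "cpu"` branch above expects a DeepSpeed JSON config whose
# filename starts with "cpu" and which enables ZeRO optimizer offload. A hedged
# sketch of what such a file might contain; the keys vary across DeepSpeed
# versions and the values below are illustrative assumptions. Written as a
# Python dict and dumped to disk.
import json

cpu_offload_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,
        # routes optimizer work to the CPU, where DeepSpeedCPUAdam runs
        "offload_optimizer": {"device": "cpu"},
    },
}

with open("cpu.json", "w") as fout:
    json.dump(cpu_offload_config, fout, indent=2)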
def configure_optimizers(self):
    return DeepSpeedCPUAdam(self.parameters())
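# configure_optimizers above only pays off when the Lightning trainer runs with
# a DeepSpeed offload strategy, since DeepSpeedCPUAdam keeps optimizer state on
# the CPU. A minimal sketch, assuming pytorch_lightning >= 1.5 with deepspeed
# installed; the trainer arguments are illustrative, not from the source.
import pytorch_lightning as pl

trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    strategy="deepspeed_stage_2_offload",  # offloads optimizer state to CPU
    precision=16,
)
# trainer.fit(model)  # configure_optimizers() then returns DeepSpeedCPUAdam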
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

device = 'cpu'
model_size = 1 * 1024**3
group_size = [model_size, 274432]

param = [
    torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size
]
optimizer = DeepSpeedCPUAdam(param)
#torch.set_num_threads(128)
for i, p in enumerate(param):
    p.grad = torch.ones(group_size[i], device=device)
#param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
    start = time.time()
    optimizer.step()
    stop = time.time()
    avg += (stop - start)
    for i, p in enumerate(param):
        p.grad = torch.ones(group_size[i], device=device) * 2
    #param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
def configure_optimizers(self):
    base_parameters = []
    lm_decay_parameters = []
    lm_no_decay_parameters = []
    if self.hparams.optim_params.optimizer == "radam":
        for parameter_name, parameter in self.named_parameters():
            if "transformer" not in parameter_name:
                base_parameters.append(parameter)
            elif not any(v in parameter_name
                         for v in ["bias", "LayerNorm.weight"]):
                lm_decay_parameters.append(parameter)
            else:
                lm_no_decay_parameters.append(parameter)

        optimizer_params = [
            {
                "params": base_parameters,
                "weight_decay": self.hparams.optim_params.weight_decay,
            },
            {
                "params": lm_decay_parameters,
                "lr": self.hparams.optim_params.lm_lr,
                "weight_decay": self.hparams.optim_params.lm_weight_decay,
            },
            {
                "params": lm_no_decay_parameters,
                "lr": self.hparams.optim_params.lm_lr,
                "weight_decay": 0.0,
            },
        ]
        optimizer = RAdam(optimizer_params, lr=self.hparams.optim_params.lr)
    elif self.hparams.optim_params.optimizer == "fuseadam":
        try:
            from deepspeed.ops.adam import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install DeepSpeed (`pip install deepspeed`) to use FuseAdam optimizer."
            )
        optimizer = FusedAdam(self.parameters())
    elif self.hparams.optim_params.optimizer == "deepspeedcpuadam":
        try:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
        except ImportError:
            raise ImportError(
                "Please install DeepSpeed (`pip install deepspeed`) to use DeepSpeedCPUAdam optimizer."
            )
        optimizer = DeepSpeedCPUAdam(self.parameters())
    elif self.hparams.optim_params.optimizer == "adafactor":
        optimizer = Adafactor(
            self.parameters(),
            scale_parameter=False,
            relative_step=False,
            warmup_init=False,
            lr=self.hparams.optim_params.lm_lr,
        )
    else:
        raise ValueError(
            f"Unknown optimizer {self.hparams.optim_params.optimizer}")
    return optimizer
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

device = 'cpu'
model_size = 1 * 1024**3
param = torch.nn.Parameter(torch.ones(model_size, device=device))
param_fp16 = torch.nn.Parameter(
    torch.ones(model_size, dtype=torch.half, device='cuda:0'))
optimizer = DeepSpeedCPUAdam([param])
#torch.set_num_threads(128)
param.grad = torch.ones(model_size, device=device)
avg = 0
for i in range(100):
    start = time.time()
    optimizer.step(fp16_param_groups=[param_fp16])
    stop = time.time()
    avg += (stop - start)
    param.grad = torch.ones(model_size, device=device) * 2
print("Elapsed Time is ", avg / 100)
def __init__(self, cfg: DictConfig):
    self.shard_id = cfg.local_rank if cfg.local_rank != -1 else 0
    self.distributed_factor = cfg.distributed_world_size or 1

    logger.info("***** Initializing components for training *****")

    # if model file is specified, encoder parameters from saved state should be used for initialization
    model_file = get_model_file(cfg, cfg.checkpoint_file_name)
    saved_state = None
    if model_file:
        saved_state = load_states_from_checkpoint(model_file)
        set_cfg_params_from_state(saved_state.encoder_params, cfg)

    tensorizer, model, optimizer = init_biencoder_components(
        cfg.encoder.encoder_model_type, cfg
    )

    if cfg.deepspeed:
        model.half()  # XXX
        #no_decay = ["bias", "LayerNorm.weight"]
        #
        #optimizer_grouped_parameters = [
        #    {
        #        "params": [
        #            p
        #            for n, p in model.named_parameters()
        #            if not any(nd in n for nd in no_decay)
        #        ],
        #        "weight_decay": cfg.train.weight_decay,
        #    },
        #    {
        #        "params": [
        #            p
        #            for n, p in model.named_parameters()
        #            if any(nd in n for nd in no_decay)
        #        ],
        #        "weight_decay": 0.0,
        #    },
        #]
        optimizer = DeepSpeedCPUAdam(optimizer.param_groups,
                                     lr=cfg.train.learning_rate,
                                     weight_decay=cfg.train.weight_decay)

    model, optimizer = setup_for_distributed_mode(
        model,
        optimizer,
        cfg.device,
        cfg.n_gpu,
        cfg.local_rank,
        cfg.fp16,
        cfg.fp16_opt_level,
    )
    self.biencoder = model
    self.optimizer = optimizer
    self.tensorizer = tensorizer
    self.start_epoch = 0
    self.start_batch = 0
    self.scheduler_state = None
    self.best_validation_result = None
    self.best_cp_name = None
    self.cfg = cfg
    self.ds_cfg = BiencoderDatasetsCfg(cfg)
    if saved_state:
        self._load_saved_state(saved_state)

    self.dev_iterator = None