def get_model(deepspeed_config_path):
    """Build the fixed-size GPT2 model, with sparse attention if configured.

    Args:
        deepspeed_config_path: Path handed to ``get_sparse_attention_config``
            to look up a sparse-attention configuration.

    Returns:
        The GPT2 model, moved to the current CUDA device and wrapped in
        ``FP16_Module``.
    """
    num_local_heads = 16
    sparse_mode = 'alternating'

    deepspeed_sparsity_config = get_sparse_attention_config(
        deepspeed_config_path, num_local_heads)

    # A missing sparsity config means the model falls back to dense attention.
    if deepspeed_sparsity_config is None:
        logger.info("Use dense attention")
    else:
        logger.info(f"Use sparse attention with mode {sparse_mode}")

    gpt2_kwargs = dict(
        num_layers=24,
        vocab_size=50264,
        hidden_size=2048,
        num_attention_heads=num_local_heads,
        embedding_dropout_prob=0.1,
        attention_dropout_prob=0.1,
        output_dropout_prob=0.1,
        max_sequence_length=2048,
        checkpoint_activations=False,
        checkpoint_num_layers=1,
        parallel_output=False,
        deepspeed_sparsity_config=deepspeed_sparsity_config,
        sparse_mode=sparse_mode,
    )
    model = GPT2Model(**gpt2_kwargs)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    return FP16_Module(model)
def get_model(args):
    """Build the GPT2 model described by ``args``.

    The model is moved to the current CUDA device, wrapped in ``FP16_Module``
    when ``args.fp16`` is set, and finally wrapped in ``DDP``.

    Args:
        args: Namespace providing the model hyper-parameters and ``fp16``.

    Returns:
        The ``DDP``-wrapped model.
    """
    print_rank_0('building GPT2 model ...')

    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=False,
    )
    model = GPT2Model(**gpt2_kwargs)

    # Only the first data-parallel rank reports the parameter count.
    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mp_rank, param_count), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    return DDP(model)
def test_one_to_many(task_load):
    """Evaluate every per-epoch checkpoint of ``task_load`` on all tasks.

    For each epoch, the checkpoint ``model-<epoch>`` in the task's model
    directory is loaded, evaluated on every task in ``args.tasks`` through
    ``test_one_to_one``, and its score dict is collected. The collected
    scores are written to ``metrics.json`` in the model directory.

    Args:
        task_load: Name of the task whose checkpoints are evaluated.
    """
    # These paths do not depend on the epoch. Computing them up front also
    # keeps ``model_dir`` defined for the final dump when the task has zero
    # training epochs (previously a NameError).
    model_dir = get_model_dir([task_load])
    config_path = os.path.join(model_dir, CONFIG_NAME)

    score_dicts = []
    for ep in range(args.n_train_epochs[task_load]):
        model_path = os.path.join(model_dir, 'model-{}'.format(ep + 1))

        # Register the task's generation token before building the model.
        gen_token = get_gen_token(task_load)
        TOKENIZER.add_tokens([gen_token])
        SPECIAL_TOKENS[task_load] = gen_token
        SPECIAL_TOKEN_IDS[task_load] = TOKENIZER.convert_tokens_to_ids(gen_token)

        model_config = CONFIG_CLASS.from_json_file(config_path)
        model = MODEL_CLASS(model_config).cuda().eval()
        state_dict = torch.load(model_path, map_location='cuda:0')
        model.load_state_dict(state_dict)

        if not args.fp32:
            model = FP16_Module(model)

        # Attached to the model object and passed along to test_one_to_one.
        model.ep = ep
        model.model_dir = model_dir
        logger.info("task: {}, epoch: {}".format(task_load, ep + 1))

        score_dict = {k: None for k in args.tasks}
        with torch.no_grad():
            for task_eval in args.tasks:
                test_one_to_one(task_load, task_eval, model, score_dict)
        logger.info("score: {}".format(score_dict))
        score_dicts.append(score_dict)

    with open(os.path.join(model_dir, "metrics.json"), "w") as f:
        json.dump(score_dicts, f)
def get_model(tokenizer, args):
    """Build the BERT model.

    The model is moved to the current CUDA device. With ``args.fp16`` it is
    wrapped in ``FP16_Module``, after which selected sub-modules can be cast
    back to fp32 through ``args.fp32_embedding``, ``args.fp32_tokentypes``
    and ``args.fp32_layernorm``. With ``args.world_size > 1`` the result is
    wrapped in ``DDP``.

    Args:
        tokenizer: Tokenizer forwarded to ``BertModel``.
        args: Namespace with the flags described above.

    Returns:
        The (possibly wrapped) model.
    """
    print('building BERT model ...')
    model = BertModel(tokenizer, args)

    param_count = sum(p.nelement() for p in model.parameters())
    print(' > number of parameters: {}'.format(param_count), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        print("fp16 mode")
        model = FP16_Module(model)

        if args.fp32_embedding:
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            embeddings.position_embeddings.float()
            embeddings.token_type_embeddings.float()

        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()

        if args.fp32_layernorm:
            # Cast every sub-module whose qualified name mentions LayerNorm.
            for module_name, submodule in model.named_modules():
                if 'LayerNorm' in module_name:
                    submodule.float()

    # Wrap model for distributed training.
    if args.world_size > 1:
        model = DDP(model)

    return model
def get_model(args, version=None):
    """Build the BERT mixture model.

    Args:
        args: Namespace providing the model hyper-parameters plus the
            ``deepspeed`` and ``fp16`` flags.
        version: ``None`` selects ``BertMixtureModel``; ``"v0"`` selects
            ``BertMixtureModel_v0``.

    Returns:
        The model on the current CUDA device, wrapped in ``FP16_Module`` when
        ``args.fp16`` is set and then in ``DDP``.

    Raises:
        ValueError: If ``version`` is neither ``None`` nor ``"v0"``.
    """
    print_rank_0('building Bert model ...')

    if version is None:
        model_cls = BertMixtureModel
    elif version == "v0":
        model_cls = BertMixtureModel_v0
    else:
        # Previously an unknown version fell through and failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError(
            'Unknown model version: {!r} (expected None or "v0")'.format(version))

    # Both model versions take the same constructor arguments.
    model = model_cls(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      layernorm_epsilon=args.layernorm_epsilon,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      num_experts=args.num_experts,
                      type_vocab_size=2)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum(p.nelement() for p in model.parameters())), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if USE_TORCH_DDP:
        i = torch.cuda.current_device()
        model = DDP(model, device_ids=[i], output_device=i,
                    process_group=mpu.get_data_parallel_group())
    else:
        model = DDP(model)

    return model
def get_model(args):
    """Build the BERT model and wrap it for distributed training.

    With ``args.fp16`` the model is wrapped in ``FP16_Module`` and selected
    sub-modules can be cast back to fp32. The distributed wrapper is chosen
    by ``args.DDP_impl`` (``'torch'`` or ``'local'``); the chosen wrapper
    class is also stored on ``args.DDP_type``. Any other value prints a
    message and exits the process.

    Args:
        args: Namespace with the model settings and the flags above.

    Returns:
        The wrapped model.
    """
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mp_rank, param_count), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

        if args.fp32_embedding:
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            # The position embedding layout depends on the dataset type.
            if args.ds_type == 'BERT':
                embeddings.position_embeddings.float()
            else:
                embeddings.token_position_embeddings.float()
                embeddings.para_position_embeddings.float()
                embeddings.sent_position_embeddings.float()
            embeddings.token_type_embeddings.float()

        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()

        if args.fp32_layernorm:
            for module_name, submodule in model.named_modules():
                if 'LayerNorm' in module_name:
                    submodule.float()

    # Wrap model for distributed training.
    ddp_impl = args.DDP_impl
    if ddp_impl == 'torch':
        device_index = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model,
                              device_ids=[device_index],
                              output_device=device_index,
                              process_group=mpu.get_data_parallel_group())
    elif ddp_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
def get_model(args):
    """Build the GPT2 model described by ``args``.

    The model is cast to half precision before GPU allocation when both
    DeepSpeed and fp16 are enabled, moved to the current CUDA device, wrapped
    in ``FP16_Module`` when ``args.fp16`` is set, and wrapped in ``DDP``
    unless DeepSpeed is enabled.

    Args:
        args: Namespace providing the model hyper-parameters and ``fp16``.
            ``deepspeed`` is optional and treated as disabled when absent.

    Returns:
        The (possibly wrapped) model.
    """
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      max_memory_length=args.mem_length,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True,
                      relative_encoding=args.transformer_xl)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum(p.nelement() for p in model.parameters())), flush=True)

    # Resolve the optional flag once. The original guarded the first use with
    # hasattr() but not the second, which raised AttributeError when the
    # attribute was missing.
    use_deepspeed = getattr(args, "deepspeed", False)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if use_deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not use_deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(model, device_ids=[i], output_device=i,
                        process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)

    return model
def get_model(args):
    """Build the GPT2 model and wrap it for distributed training.

    The distributed wrapper is chosen by ``args.DDP_impl`` (``'torch'`` or
    ``'local'``); the chosen wrapper class is also stored on
    ``args.DDP_type``. Any other value prints a message and exits the
    process.

    Args:
        args: Namespace providing the model hyper-parameters, ``fp16`` and
            ``DDP_impl``.

    Returns:
        The wrapped model.
    """
    print_rank_0('building GPT2 model ...')

    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
    )
    model = GPT2Model(**gpt2_kwargs)

    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mp_rank, param_count), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    ddp_impl = args.DDP_impl
    if ddp_impl == 'torch':
        device_index = torch.cuda.current_device()
        args.DDP_type = torch.nn.parallel.distributed.DistributedDataParallel
        model = args.DDP_type(model,
                              device_ids=[device_index],
                              output_device=device_index,
                              process_group=mpu.get_data_parallel_group())
    elif ddp_impl == 'local':
        args.DDP_type = LocalDDP
        model = args.DDP_type(model)
    else:
        print_rank_0('Unknown DDP implementation specified: {}. '
                     'Exiting.'.format(args.DDP_impl))
        exit()

    return model
def get_model(args):
    """Build the BERT model and wrap it in ``DDP``.

    With ``args.fp16`` the model is wrapped in ``FP16_Module``, after which
    selected sub-modules can be cast back to fp32 through
    ``args.fp32_embedding``, ``args.fp32_tokentypes`` and
    ``args.fp32_layernorm``.

    Args:
        args: Namespace with the model settings and the flags above.

    Returns:
        The ``DDP``-wrapped model.
    """
    print_rank_0('building BERT model ...')
    model = BertModel(args)

    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mp_rank, param_count), flush=True)

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

        if args.fp32_embedding:
            embeddings = model.module.model.bert.embeddings
            embeddings.word_embeddings.float()
            embeddings.position_embeddings.float()
            embeddings.token_type_embeddings.float()

        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()

        if args.fp32_layernorm:
            for module_name, submodule in model.named_modules():
                if 'LayerNorm' in module_name:
                    submodule.float()

    # Wrap model for distributed training.
    if not USE_TORCH_DDP:
        return DDP(model)

    device_index = torch.cuda.current_device()
    return DDP(model,
               device_ids=[device_index],
               output_device=device_index,
               process_group=mpu.get_data_parallel_group())
def get_model(args, config, do_fp16=False):
    """Build a GPT2 model from a keyword-argument mapping.

    Args:
        args: Namespace providing ``checkpoint_activations``,
            ``checkpoint_num_layers`` and ``deepspeed``.
        config: Mapping expanded into ``GPT2Model`` keyword arguments.
        do_fp16: When true, wrap the model in ``FP16_Module``; combined with
            ``args.deepspeed`` the model is also cast to half precision
            before it is moved to the GPU.

    Returns:
        The ``DDP``-wrapped model.
    """
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(**config,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=True)

    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mp_rank, param_count), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.deepspeed and do_fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if do_fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not USE_TORCH_DDP:
        return DDP(model)

    device_index = torch.cuda.current_device()
    return DDP(model,
               device_ids=[device_index],
               output_device=device_index,
               process_group=mpu.get_data_parallel_group())
def _apply_weight_norm_to_model(model):
    """Apply weight norm to ``model.rnn`` when present, else to ``model``."""
    target = model.rnn if hasattr(model, 'rnn') else model
    apply_weight_norm(target, hook_child=False)


def setup_model_and_optim(args, train_data, tokenizer):
    """Build the language model, optimizer, LR schedulers and loss.

    Args:
        args: Namespace with the model, optimizer, checkpoint and schedule
            settings read below.
        train_data: Training data loader, or ``None`` to skip creating the
            learning-rate schedulers.
        tokenizer: Tokenizer; its ``pad`` command id is used as the embedding
            padding index for the transformer model.

    Returns:
        Tuple ``(model, optim, LR, LR_Warmer, criterion)``. ``LR`` is ``None``
        when ``train_data`` is ``None``; ``LR_Warmer`` is ``None`` unless a
        warmup is configured.

    Raises:
        ValueError: If ``args.load_optim`` is set but ``args.load`` does not
            name a checkpoint to load the optimizer state from.
    """
    global rnn_model

    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(
            ntokens, args.decoder_embed_dim,
            padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        rnn_model = model

    # Initialised up front so the return statement is valid on every path
    # (LR was previously unbound when train_data is None).
    LR = None
    LR_Warmer = None
    optim_sd = None

    print('* number of parameters: %d' %
          sum(p.nelement() for p in model.parameters()))
    if args.cuda:
        model.cuda()

    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            load_dir = os.path.dirname(args.load)
            # The optimizer state is consumed further down (load_state_dict,
            # init_step, start_iter); without this load those uses raised
            # NameError.
            optim_sd = torch.load(os.path.join(load_dir, 'optim.pt'),
                                  map_location='cpu')
            rng = torch.load(os.path.join(load_dir, 'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except Exception:
            # Retry with weight norm applied, presumably for checkpoints that
            # were saved from a weight-normed model; then strip it again.
            _apply_weight_norm_to_model(model)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        _apply_weight_norm_to_model(model)

    # NOTE(review): the original computed an unused
    # `'Adam' if args.stlr_cut_frac else args.optim` here; args.optim has
    # always been what is actually used.
    optim_cls = getattr(torch.optim, args.optim)
    if args.fp16:
        model = FP16_Module(model)
        optim = optim_cls(model.parameters(), lr=args.lr)
        optim = FP16_Optimizer(optim,
                               static_loss_scale=args.loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)
    else:
        optim = optim_cls(model.parameters(), lr=args.lr)

    if args.load_optim:
        if optim_sd is None:
            raise ValueError(
                'args.load_optim is set but args.load does not name a '
                'checkpoint to load the optimizer state from')
        optim.load_state_dict(optim_sd)

    # add linear learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs

        init_step = -1
        if args.load_optim:
            # TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (
                optim_sd['iter'] % len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim, cut_frac=args.stlr_cut_frac,
                                     num_iters=num_iters)
        else:
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iter,
                             num_iters=num_iters,
                             decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduce=False)
    return model, optim, LR, LR_Warmer, criterion
def _build_smp_config(args):
    """Translate the parsed arguments into the dict passed to ``smp.init``."""
    # any value here is overriden by the config set in notebook when launching the sagemaker job
    smp_config = {
        "ddp": True,
        "tensor_parallel_degree": args.tensor_parallel_degree,
        "pipeline_parallel_degree": args.pipeline_parallel_degree,
        "microbatches": args.microbatches,
        # if activation_checkpointing true checkpoints transformer layers below
        "checkpoint_attentions": not args.activation_checkpointing,
        "shard_optimizer_state": args.shard_optimizer_state > 0,
        "prescaled_batch": args.prescaled_batch > 0,
        "offload_activations": args.offload_activations > 0,
        "optimize": args.optimize,
        "auto_partition": not args.manual_partition,
        "default_partition": 0,
        "static_mode": args.static_mode > 0,
        "fast_mode": args.fast_mode > 0,
    }

    if args.smp_version < 110:
        smp_config["fp16_params"] = args.fp16 > 0
    else:
        smp_config["fp16"] = args.fp16 > 0
        smp_config["delayed_parameter_initialization"] = args.delayed_param > 0
        smp_config["placement_strategy"] = args.placement_strategy
        smp_config["activation_loading_horizon"] = args.activation_loading_horizon
        smp_config["skip_tracing"] = args.skip_tracing > 0

    if args.active_microbatches is not None:
        smp_config["active_microbatches"] = args.active_microbatches

    return smp_config


def _partition_layer_counts(num_layers, pp_size, partition_assignment):
    """Return the number of transformer layers for each pipeline partition.

    With an explicit ``partition_assignment`` (a sequence of ``pp_size``
    integer strings) the counts are taken from it and must add up to
    ``num_layers``. Otherwise the layers are spread evenly, with the
    remainder going to the last partitions.
    """
    if partition_assignment is not None:
        counts = [int(partition_assignment[pp_rank]) for pp_rank in range(pp_size)]
        total_layers = sum(counts)
        if total_layers != num_layers:
            raise ValueError(
                f"partition_assignment must have the same total transformer layers as model, but getting {total_layers} vs {num_layers}"
            )
        return counts

    # evenly distribute layers across all partitions
    div, rem = divmod(num_layers, pp_size)
    return [div + 1 if pp_rank >= pp_size - rem else div
            for pp_rank in range(pp_size)]


def main():
    """Entry point: configure SMP, build the GPT-2 model, train, and save.

    Steps, in order: parse arguments and initialise ``smp``; create the
    output directories; build the model and wrap it in
    ``smp.DistributedModel``; optionally assign transformer layers to
    pipeline partitions manually; build the optimizer and LR scheduler;
    optionally load a checkpoint; run ``train``; optionally save the final
    full model.

    Raises:
        ValueError: If optimizer-state sharding is enabled without
            ``skip_full_optimizer``, or if ``partition_assignment`` does not
            match the pipeline-parallel degree or the number of layers.
    """
    args = parse_args()

    if args.shard_optimizer_state > 0 and not args.skip_full_optimizer:
        raise ValueError(
            "If shard_optimizer_state is enabled, skip_full_optimizer must also be enabled. Full optimizer saving is currently not supported under optimizer state sharding."
        )

    if args.partition_assignment != "" and args.manual_partition == 0:
        print("[Warning] partition_assignment is set, enable manual_partition")
        args.manual_partition = 1

    smp_config = _build_smp_config(args)
    smp.init(smp_config)

    if smp.rank() == 0:
        print("Arguments:", args.__dict__)
        print(f"Transformers version: {transformers.__version__}")
        print(
            f"smdistributed.modelparallel version: {smdistributed.modelparallel.__version__}"
        )
        print(f"smdistributed config: {smp_config}")

    if args.save_final_full_model and smp.rank() == 0:
        print(
            f"[Warning] Note that save_final_full_model only saves the final model at the end of all steps. It does not save optimizer state. Optimizer state is only saved with partial models which are saved at checkpointing_freq during training. If you want to restart training you need partial checkpoints."
        )

    partition_assignment = None
    if args.partition_assignment != "":
        partition_assignment = args.partition_assignment.split(",")
        # Raised explicitly (not asserted) so the check survives ``python -O``.
        if len(partition_assignment) != smp.pp_size():
            raise ValueError(
                f"partition_assignment must have the same size as pipeline parallel degree, but getting {len(partition_assignment)} vs {smp.pp_size()}"
            )

    if smp.rank() == 0 or (smp.local_rank() == 0 and args.use_fsx == 0):
        for path in [args.model_dir, args.checkpoint_dir]:
            # exist_ok already covers the "directory exists" case without the
            # check-then-create race between ranks.
            os.makedirs(path, exist_ok=True)

    model_config = GPT2Config(
        vocab_size=args.vocab_size,
        n_positions=args.max_context_width,
        n_embd=args.hidden_width,
        n_layer=args.num_layers,
        n_head=args.num_heads,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=args.resid_pdrop,
        embd_pdrop=args.embd_pdrop,
        attn_pdrop=args.attn_pdrop,
        layer_norm_epsilon=1e-05,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=args.summary_first_pdrop,
        # gradient_checkpointing=args.gradient_checkpointing > 0,
        use_cache=False,
        bos_token_id=50256,
        eos_token_id=50256,
        return_dict=True,
    )

    # the following improves start-up time by skipping proper initialization
    # of weights in the original model. this is not a problem because DistributedModel
    # will override those weights anyway when tensor_parallel_degree > 1.
    if smp.tp_size() > 1:
        from transformers.modeling_utils import PreTrainedModel

        PreTrainedModel.init_weights = lambda x: None

    set_seed(args.seed)

    if args.enable_memory_profiling > 0:
        memory_status_cpu(msg="before model creation")

    if args.smp_version < 110:
        if args.fp16:
            torch.set_default_dtype(torch.float16)
        with smp.tensor_parallelism(
                enabled=smp.tp_size() > 1,
                attention_in_fp32=args.attention_in_fp32 > 0):
            with smp.delay_param_initialization(
                    enabled=(smp.tp_size() > 1 and args.delayed_param > 0)):
                model = AutoModelForCausalLM.from_config(model_config)
    else:
        with smp.model_creation(
                tensor_parallelism=smp.tp_size() > 1,
                attention_in_fp32=args.attention_in_fp32 > 0,
                query_key_layer_scaling=args.query_key_layer_scaling > 0,
                fused_softmax=args.fused_softmax > 0,
                fused_bias_gelu=args.fused_bias_gelu > 0,
                dtype=torch.float16 if args.fp16 else torch.get_default_dtype(),
        ):
            model = AutoModelForCausalLM.from_config(model_config)

    if args.smp_version < 110 and args.fp16:
        model = FP16_Module(model)

    if args.enable_memory_profiling > 0:
        memory_status_cpu(msg="after model creation")

    num_params = sum([np.prod(p.size()) for p in model.parameters()])
    if smp.rank() == 0:
        print(f"# total parameters: {num_params}")

    # smdistributed: Set the device to the GPU ID used by the current process.
    # Input tensors should be transferred to this device.
    torch.cuda.set_device(smp.local_rank())

    if not args.same_seed:
        # Set seed by tp_rank to prevent weights from being the same on different tp_ranks
        set_seed(args.seed + smp.tp_rank())

    # smdistributed: Use the DistributedModel container to provide the model
    # to be partitioned across different ranks. For the rest of the script,
    # the returned DistributedModel object should be used in place of
    # the model provided for DistributedModel class instantiation.
    if args.smp_version < 110 and args.fp16:
        torch.set_default_dtype(torch.float16)
    if args.enable_memory_profiling > 0:
        memory_status_cpu(msg="before dist model creation")
    model = smp.DistributedModel(model, trace_device="gpu")
    if args.enable_memory_profiling > 0:
        memory_status_cpu(msg="after dist model creation")

    # Locate the container holding the transformer layers; the attribute path
    # differs by SMP version and by whether tensor parallelism is active.
    if args.smp_version < 110:
        if smp.tp_size() > 1:
            transformer_layers = model.module.module.module.transformer.seq_layers
        else:
            transformer_layers = model.module.module.module.transformer.h
    else:
        m = model.get_module()
        if smp.tp_size() > 1:
            transformer_layers = m.transformer.seq_layers
        else:
            transformer_layers = m.transformer.h

    if args.manual_partition:
        print("Manual partition enabled")
        layer_counts = _partition_layer_counts(
            args.num_layers, smp.pp_size(), partition_assignment)

        assignments = []
        # (TODO) This is required for 175B otherwise a hang for partition "8,17,17,18,18,18"
        # Need further investigation
        # for pp_rank in reversed(range(smp.pp_size())):
        for pp_rank, nl in enumerate(layer_counts):
            print(f"{nl} layers assigned to partition {pp_rank}")
            assignments += [pp_rank for _ in range(nl)]

        for i, c in enumerate(transformer_layers.children()):
            smp.set_partition(c, assignments[i])

    if args.smp_version < 110:
        iter_model = model
        # Build parameter groups (weight decay and non-decay).
        while isinstance(iter_model, (DistributedDataParallel, FP16_Module)):
            iter_model = iter_model.module
    else:
        iter_model = m
    param_groups = get_param_groups_by_weight_decay(iter_model)

    if args.use_adamw > 0:
        optimizer = optim.AdamW(param_groups,
                                betas=(args.beta1, args.beta2),
                                lr=args.lr,
                                weight_decay=args.weight_decay)
    else:
        optimizer = optim.Adam(param_groups,
                               betas=(args.beta1, args.beta2),
                               lr=args.lr,
                               weight_decay=args.weight_decay)

    if args.activation_checkpointing:
        kwargs = {}
        if isinstance(transformer_layers, nn.Sequential):
            kwargs["pack_args_as_tuple"] = True
            kwargs["strategy"] = args.activation_strategy
        smp.set_activation_checkpointing(transformer_layers, **kwargs)

    if args.smp_version < 110:
        optimizer = FP16_Optimizer(
            model,
            optimizer,
            static_loss_scale=None,
            dynamic_loss_scale=True,
            use_smp=True,
            dynamic_loss_args={
                "scale_window": 1000,
                "min_scale": 1,
                "delayed_shift": 2
            },
            params_have_main_grad=False,
            shard_optimizer_state=args.shard_optimizer_state > 0,
        )
        optimizer = smp.DistributedOptimizer(optimizer)
        model.register_post_step_hook(
            lambda model, optimizer: optimizer.init_master_params())
    else:
        optimizer = smp.DistributedOptimizer(
            optimizer,
            static_loss_scale=None,
            dynamic_loss_scale=True,
            dynamic_loss_args={
                "scale_window": 1000,
                "min_scale": 1,
                "delayed_shift": 2
            },
        )
    lr_scheduler = get_learning_rate_scheduler(optimizer, args)

    if args.enable_memory_profiling > 0:
        model.register_post_partition_hook(
            lambda model, optimizer: memory_status(msg="After_partition"))

    # load after wrapping model and optimizer with smp Distributed...
    if args.load_full or args.load_partial:
        if args.load_partial and args.load_full:
            print(
                "Since both --load_partial and --load_full set, will try to load from full checkpoint."
                "If the intention is to load from partial checkpoint, please don't set --load_full"
            )
        partial = not args.load_full
        path = args.checkpoint_dir if partial else args.model_dir
        translate_from_hf = not partial
        model, optimizer, total_steps, start_train_path_index, start_batch_index = load_model_and_optimizer(
            path,
            model,
            optimizer,
            lr_scheduler,
            partial,
            args,
            translate_from_hf=translate_from_hf,
            seq_length=args.max_context_width,
            load_model=True,
            load_optimizer=args.load_partial > 0,
            num_params=num_params,
        )
    else:
        total_steps = 0
        start_train_path_index = 0
        start_batch_index = 0

    start = time.time()
    total_steps, throughput, loss = train(
        model,
        optimizer,
        lr_scheduler,
        model_config,
        start_train_path_index,
        start_batch_index,
        num_params,
        total_steps,
        args,
    )
    time_to_train = time.time() - start

    if args.ci:
        print(f"[SMP_METRIC]__GPT2__Time_to_train__{time_to_train}")
        print(f"[SMP_METRIC]__GPT2__samples/second__{throughput}")
        print(f"[SMP_METRIC]__GPT2__Loss__{loss}")
        # CI thresholds only apply to runs that started from scratch.
        if not args.load_partial and not args.load_full:
            assert time_to_train < args.time_to_train
            assert throughput > args.throughput
            if args.loss:
                assert loss < args.loss

    if args.save_final_full_model:
        # saves full model at the end
        base_path = f"trained_gpt_nparams-{num_params}_steps-{total_steps}.pt"
        out_path = os.path.join(args.model_dir, base_path)

        if smp.rdp_rank() == 0:
            save(
                out_path,
                model,
                optimizer,
                lr_scheduler,
                model_config,
                num_params,
                total_steps,
                -1,
                args,
                partial=False,
                translate_to_hf=smp.tp_size() > 1,
                seq_length=args.max_context_width,
            )

    smp.barrier()
    if smp.rank() == 0:
        print("SMP training finished successfully")
rng = torch.load(os.path.join(os.path.dirname(args.load), 'rng.pt')) torch.cuda.set_rng_state(rng[0]) torch.set_rng_state(rng[1]) try: model.load_state_dict(sd) except: apply_weight_norm(model.rnn, hook_child=False) model.load_state_dict(sd) remove_weight_norm(model.rnn) if not args.no_weight_norm: apply_weight_norm(model, 'rnn', hook_child=False) # create optimizer and fp16 models if args.fp16: model = FP16_Module(model) optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr) optim = FP16_Optimizer(optim, static_loss_scale=args.loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) else: optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr) if args.load_optim: pass optim.load_state_dict(optim_sd) # add linear learning rate scheduler if train_data is not None: if args.constant_decay: num_iters = args.constant_decay
def get_model(args, model_type=None, multi_token=True, num_labels=None):
    """Build the GLM model, optionally wrapped in a task-specific head.

    Args:
        args: Namespace providing the model hyper-parameters and the
            ``fp16``, ``cloze_eval``, ``fast_decode`` and ``DDP_impl``
            settings. ``deepspeed`` is optional and treated as disabled when
            absent.
        model_type: ``None`` for the bare model, or one of ``'cloze'``,
            ``'classification'`` or ``'generation'``.
        multi_token: For ``'cloze'``, select the multi-token wrapper instead
            of the single-token one.
        num_labels: Number of classes for the ``'classification'`` wrapper.

    Returns:
        The model on the current CUDA device, wrapped in ``FP16_Module`` when
        ``args.fp16`` is set and in a DDP wrapper unless DeepSpeed is enabled.

    Raises:
        NotImplementedError: If ``model_type`` is not ``None`` and not one of
            the supported values.
    """
    print_rank_0('building GLM model ...')

    output_predict, parallel_output = True, True
    if (model_type == "multiple_choice" or model_type == "classification") and not args.cloze_eval:
        output_predict = False
    if model_type is not None:
        parallel_output = False

    model = GLMModel(num_layers=args.num_layers,
                     vocab_size=args.vocab_size,
                     hidden_size=args.hidden_size,
                     num_attention_heads=args.num_attention_heads,
                     embedding_dropout_prob=args.hidden_dropout,
                     attention_dropout_prob=args.attention_dropout,
                     output_dropout_prob=args.hidden_dropout,
                     max_sequence_length=args.max_position_embeddings,
                     max_memory_length=args.mem_length,
                     checkpoint_activations=args.checkpoint_activations,
                     checkpoint_num_layers=args.checkpoint_num_layers,
                     parallel_output=parallel_output,
                     relative_encoding=args.transformer_xl,
                     block_position_encoding=args.block_lm and not args.masked_lm,
                     output_predict=output_predict)

    if model_type is not None:
        if model_type == 'cloze':
            if multi_token:
                if args.fast_decode:
                    model = GLMForMultiTokenClozeFast(model, length_penalty=args.length_penalty)
                else:
                    model = GLMForMultiTokenCloze(model, length_penalty=args.length_penalty)
            else:
                model = GLMForSingleTokenCloze(model)
        elif model_type == 'classification':
            model = GLMForSequenceClassification(model, args.hidden_size, args.output_dropout,
                                                 args.pool_token, num_class=num_labels)
        elif model_type == 'generation':
            # The bare model is used as-is for generation.
            pass
        else:
            raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum(p.nelement() for p in model.parameters())), flush=True)

    # Resolve the optional flag once. The original guarded the first use with
    # hasattr() but not the second, which raised AttributeError when the
    # attribute was missing.
    use_deepspeed = getattr(args, "deepspeed", False)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if use_deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not use_deepspeed:
        if args.DDP_impl == 'torch':
            i = torch.cuda.current_device()
            model = TorchDDP(model, device_ids=[i], output_device=i,
                             process_group=mpu.get_data_parallel_group())
        else:
            model = LocalDDP(model)

    return model
def train(task_ids, model):
    """Train on the task(s) selected by ``task_ids`` and return the model.

    Besides training, this function adds the task's generation token to the
    tokenizer, saves the tokenizer and model config into the task's model
    directory, grows the module-level ``TOKENS_WEIGHT`` to match the
    vocabulary, and writes one checkpoint per epoch plus a final checkpoint.

    Args:
        task_ids: Indices into ``args.tasks`` naming the task(s) to train on.
            Only the first entry is used for the generation token, the
            replay/GEM data and the regularizer.
        model: Model to continue training. When falsy, a pretrained
            ``args.model_name`` model is loaded instead.

    Returns:
        The trained model. When the task is listed in ``args.skip_tasks``
        the function returns early: with the checkpoint reloaded from disk
        if it was the only skipped task, otherwise with ``model`` unchanged.
    """
    # Declared up front because the function both reads and rebinds it.
    global TOKENS_WEIGHT

    tasks = [args.tasks[task_id] for task_id in task_ids]
    logger.info("start to train { task: %s, seq train type: %s }" % (tasks, args.seq_train_type))
    model_dir = get_model_dir(tasks)
    make_dir(model_dir)

    train_dataset = [
        swap_name(TASK_DICT[t]["train"], args.seq_distil, args.ref1)
        for t in tasks
    ]

    # Extra data for every task after the first: generated from the previous
    # task's model ("lll") or stored in args.memory_data ("gem").
    train_extra_data = []
    if "lll" in args.seq_train_type and task_ids[0] > 0 and not args.skip_tasks:
        prev_task = args.tasks[task_ids[0] - 1]
        with torch.no_grad():
            create_extra_data(tasks[0], prev_task, model, train_extra_data)
    elif "gem" in args.seq_train_type and task_ids[0] > 0:
        get_real_data(tasks[0], train_extra_data, accum=False, encode=True)
        args.memory_data.append(train_extra_data)
        train_extra_data = []
    logger.info('extra training data size: {}'.format(len(train_extra_data)))

    if not model:
        model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        model.resize_token_embeddings(len(TOKENIZER))
        if not args.fp32:
            model = FP16_Module(model)

    # Register the generation token of the first task and persist the
    # tokenizer and config next to the checkpoints.
    gen_token = get_gen_token(tasks[0])
    TOKENIZER.add_tokens([gen_token])
    TOKENIZER.save_pretrained(model_dir)
    SPECIAL_TOKENS[tasks[0]] = gen_token
    SPECIAL_TOKEN_IDS[tasks[0]] = TOKENIZER.convert_tokens_to_ids(gen_token)
    logger.info('gen token = {} , gen token id = {}'.format(
        gen_token, SPECIAL_TOKEN_IDS[tasks[0]]))
    MODEL_CONFIG.vocab_size = len(TOKENIZER)
    MODEL_CONFIG.to_json_file(os.path.join(model_dir, CONFIG_NAME))
    if len(TOKENIZER) != TOKENS_WEIGHT.shape[0]:
        TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))

    if args.skip_tasks and len(tasks) == 1:
        logger.info("*********** skip task: {} ***********".format(tasks[0]))
        if tasks[0] in args.skip_tasks:
            if len(args.skip_tasks) == 1:
                # Last task to skip: reload its final checkpoint so training
                # of the following task starts from it.
                model_dir = get_model_dir(tasks)
                model_path = os.path.join(model_dir, FINAL_SAVE_NAME)
                config_path = os.path.join(model_dir, CONFIG_NAME)
                model_config = CONFIG_CLASS.from_json_file(config_path)
                model = MODEL_CLASS(model_config).cuda()
                state_dict = torch.load(model_path)
                model.load_state_dict(state_dict)
                if not args.fp32:
                    model = FP16_Module(model)
                if args.seq_train_type in REG_TYPE_KEYS:
                    logger.info("calulating reg_params ...")
                    train_qadata = QADataset(train_dataset, "train",
                                             SPECIAL_TOKEN_IDS[tasks[0]], train_extra_data)
                    max_train_batch_size = max(
                        len(train_qadata) // args.min_n_steps, args.min_batch_size)
                    train_dataloader = create_dataloader(
                        train_qadata, "train", max_train_batch_size)
                    parallel_model = DataParallelModel(WrapModel(model), args.device_ids)
                    regularizer = REG_TYPES[args.seq_train_type](
                        model, parallel_model, [train_dataloader], tasks[0])
                    regularizer.task_start_do()
                    regularizer.task_end_do()
                    torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME))
                    logger.info("done reg_params!")
            args.skip_tasks.remove(tasks[0])
            return model

    model.resize_token_embeddings(
        len(TOKENIZER) if not args.multitask_specific else len(TOKENIZER) + 4)
    if args.multitask_specific:
        # Four extra embedding rows were added above; give each a unit weight.
        for _ in range(4):
            TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))

    if args.distil:
        # The teacher checkpoint location is hard-coded per task.
        teacher_dir = "models/gpt2/lll/{task}_0.2/{task}".format(task=tasks[0])
        teacher_config_path = teacher_dir + "/config.json"
        teacher_weights_path = teacher_dir + "/model-finish"

        teacher_model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        # Use a context manager so the config file handle is closed promptly.
        with open(teacher_config_path, encoding="utf-8") as config_file:
            teacher_vocab_size = json.load(config_file)['vocab_size']
        teacher_model.resize_token_embeddings(teacher_vocab_size)
        print("load teacher model from {}".format(teacher_weights_path))
        teacher_model.load_state_dict(torch.load(teacher_weights_path))
        if not args.fp32:
            teacher_model = FP16_Module(teacher_model)
        teacher_model.eval()
        teacher_model = DataParallelModel(WrapModel(teacher_model), args.device_ids)

    if not args.fp32:  # again because resize_token_embeddings makes embedding layer fp32
        model = FP16_Module(model)

    parallel_model = DataParallelModel(WrapModel(model), args.device_ids)

    train_qadata = QADataset(train_dataset, "train", SPECIAL_TOKEN_IDS[tasks[0]],
                             train_extra_data)
    max_train_batch_size = max(
        len(train_qadata) // args.min_n_steps, args.min_batch_size)
    train_dataloader = create_dataloader(train_qadata, "train", max_train_batch_size)

    # Epoch counts are keyed by the single task name, or by the joined task
    # names for unbound / multitask runs.
    if not args.unbound and args.seq_train_type not in ["multitask", "multilm"]:
        n_train_epochs = args.n_train_epochs[tasks[0]]
    else:
        n_train_epochs = args.n_train_epochs['_'.join(tasks)]
    n_train_optimization_steps = len(train_qadata) * n_train_epochs
    logger.info(
        'len of train dataset: {} , max train batch size {} , num of opt steps: {}'
        .format(len(train_qadata), max_train_batch_size, n_train_optimization_steps))

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if "gem" in args.seq_train_type:
        model.task_id = task_ids[0]
        if not hasattr(model, "grad_dims"):
            model.grad_dims = []
            for param in model.parameters():
                model.grad_dims.append(param.data.numel())
        if not hasattr(model, "grads"):
            # One column of flattened gradients per task.
            model.grads = torch.zeros(sum(model.grad_dims), len(args.tasks))
            model.grads = model.grads.cuda()

    if args.seq_train_type in REG_TYPE_KEYS:
        optimizer = Weight_Regularized_AdamW(optimizer_grouped_parameters,
                                             lr=args.learning_rate,
                                             eps=args.adam_epsilon)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    if not args.fp32:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=None,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={
                                       'scale_window': 100,
                                       'min_scale': 1,
                                       'delayed_shift': 2
                                   })

    scheduler = AnnealingLR(optimizer,
                            start_lr=args.learning_rate,
                            warmup_iter=int(args.n_warmup_ratio * len(train_qadata)),
                            num_iters=int(n_train_optimization_steps),
                            decay_style=args.decay_style)
    train_loss_fct = DataParallelCriterion(
        CrossEntropyLoss(ignore_index=FILL_VAL, weight=TOKENS_WEIGHT), args.device_ids)
    if args.distil:
        kd_loss_fct = DataParallelCriterion(
            nn.KLDivLoss(reduction="batchmean"), args.device_ids)

    if args.seq_train_type in REG_TYPE_KEYS:
        copy_train_dataloader = create_dataloader(train_qadata, "train",
                                                  max_train_batch_size)
        # NOTE(review): for task_ids[0] == 0 this index is -1, i.e. the last
        # entry of args.tasks — confirm that is what the regularizer expects.
        prev_task = args.tasks[task_ids[0] - 1]
        regularizer = REG_TYPES[args.seq_train_type](model, parallel_model,
                                                     [copy_train_dataloader],
                                                     tasks[0], prev_task)
        regularizer.task_start_do()

    tot_n_steps = 0
    train_once = TrainStep(model, optimizer, scheduler)
    if "gem" in args.seq_train_type and task_ids[0] != 0:
        gem_step = GEMStep(model, parallel_model, train_loss_fct, optimizer)

    model.train()
    for ep in range(n_train_epochs):
        cum_loss, cum_qa_loss, cum_lm_loss, cur_n_inputs = 0, 0, 0, 0
        for n_steps, (_, _, cqa, _, Y, gen_X, gen_Y, is_extra) in enumerate(train_dataloader):
            n_inputs = sum(_cqa.shape[0] for _cqa in cqa)

            if args.multitask_specific:
                # Fold the is_extra value into column 0 of gen_X, then zero
                # the flag so it is not applied again downstream.
                for i in range(len(is_extra)):
                    gen_X[i][:, 0] += is_extra[i]
                    is_extra[i] = is_extra[i] * 0

            # Move the i-th shard of every input to the i-th device.
            for i in range(len(cqa)):
                cqa[i] = (cqa[i].to(args.device_ids[i]), )
                Y[i] = Y[i].to(args.device_ids[i])
                gen_X[i] = (gen_X[i].to(args.device_ids[i]), )
                gen_Y[i] = gen_Y[i].to(args.device_ids[i])
                is_extra[i] = is_extra[i].to(args.device_ids[i])

            if args.distil:
                losses = get_distil_losses(teacher_model,
                                           parallel_model,
                                           cqa,
                                           Y,
                                           gen_X,
                                           gen_Y,
                                           is_extra,
                                           kd_loss_fct,
                                           train_loss_fct,
                                           args.temperature_kd,
                                           pad_idx=FILL_VAL)
            else:
                losses = get_losses(parallel_model, cqa, Y, gen_X, gen_Y,
                                    train_loss_fct)
            loss = sum(losses)
            if "gem" in args.seq_train_type and task_ids[0] != 0:
                gem_step(task_ids[0])
            train_once(loss, n_inputs)

            # Accumulate input-weighted losses for the running averages.
            qa_loss = losses[0].item() * n_inputs
            lm_loss = losses[1].item() * n_inputs
            cum_loss += (qa_loss + lm_loss)
            cum_qa_loss += qa_loss
            cum_lm_loss += lm_loss
            cur_n_inputs += n_inputs

            if (n_steps + 1) % args.logging_steps == 0:
                logger.info(
                    'progress {:.3f} , lr {:.1E} , loss {:.3f} , qa loss {:.3f} , lm loss {:.3f} , avg batch size {:.1f}'
                    .format(ep + cur_n_inputs / len(train_qadata),
                            scheduler.get_lr(), cum_loss / cur_n_inputs,
                            cum_qa_loss / cur_n_inputs,
                            cum_lm_loss / cur_n_inputs,
                            cur_n_inputs / (n_steps + 1)))

        # Per-epoch checkpoint and summary.
        torch.save(model.state_dict(),
                   os.path.join(model_dir, SAVE_NAME + str(ep + 1)))
        tot_n_steps += (n_steps + 1)
        logger.info(
            'epoch {}/{} done , tot steps {} , lr {:.1E} , loss {:.2f} , qa loss {:.2f} , lm loss {:.2f} , avg batch size {:.1f}'
            .format(ep + 1, n_train_epochs, tot_n_steps, scheduler.get_lr(),
                    cum_loss / cur_n_inputs, cum_qa_loss / cur_n_inputs,
                    cum_lm_loss / cur_n_inputs, cur_n_inputs / (n_steps + 1)))

    # task end do for reg
    if args.seq_train_type in REG_TYPE_KEYS:
        regularizer.task_end_do()
    torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME))

    return model
def get_model(args, model_type=None, multi_token=True, num_labels=None, spell_length=None):
    """Construct the network selected by ``args`` and ready it for the current device.

    With ``args.pretrained_bert`` set, a pretrained BERT head is loaded for
    ``model_type`` ``"multiple_choice"`` or ``"classification"`` (anything
    else raises ``NotImplementedError``). Otherwise a ``GLMModel`` is built
    and, depending on ``model_type``, wrapped in a cloze or
    sequence-classification head; ``'generation'`` and ``None`` keep the bare
    model, and unknown values raise ``NotImplementedError``.

    Args:
        args: Namespace-like object with the model, precision and
            distributed-training settings read below.
        model_type: Task flavour, see above.
        multi_token: Chooses multi-token over single-token cloze heads when
            ``args.cloze_eval`` is set for multiple-choice tasks.
        num_labels: Class count for the classification heads.
        spell_length: Forwarded to ``GLMModel``; logged when not ``None``.

    Returns:
        The model on the current CUDA device, halved and wrapped in
        ``FP16_Module`` when ``args.fp16`` is set, and wrapped for data
        parallelism when deepspeed is off and training is requested.
    """
    print_rank_0('building GPT2 model ...')

    if args.pretrained_bert:
        # Both BERT heads share the same loading arguments; only the class
        # and the trailing ``num_labels`` keyword differ.
        if model_type == "multiple_choice":
            bert_cls, head_kwargs = BertForMultipleChoice, {}
        elif model_type == "classification":
            bert_cls, head_kwargs = BertForSequenceClassification, {"num_labels": num_labels}
        else:
            raise NotImplementedError
        model = bert_cls.from_pretrained(
            args.tokenizer_model_type,
            cache_dir=args.cache_dir,
            fp32_layernorm=args.fp32_layernorm,
            fp32_embedding=args.fp32_embedding,
            layernorm_epsilon=args.layernorm_epsilon,
            **head_kwargs)
    else:
        # Token prediction is switched off only for classification-style
        # tasks that are not evaluated as cloze questions.
        uses_cls_head = (model_type in ("multiple_choice", "classification")
                         and not args.cloze_eval)
        if spell_length is not None:
            print_rank_0(f"Continuous spell length {spell_length}")

        model = GLMModel(
            num_layers=args.num_layers,
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            embedding_dropout_prob=args.hidden_dropout,
            attention_dropout_prob=args.attention_dropout,
            output_dropout_prob=args.hidden_dropout,
            max_sequence_length=args.max_position_embeddings,
            max_memory_length=args.mem_length,
            checkpoint_activations=args.checkpoint_activations,
            checkpoint_num_layers=args.checkpoint_num_layers,
            # Parallel output is kept only for the bare language model.
            parallel_output=model_type is None,
            relative_encoding=args.transformer_xl,
            block_position_encoding=args.block_lm and not args.masked_lm,
            output_predict=not uses_cls_head,
            spell_length=spell_length,
            spell_func=args.prompt_func,
            attention_scale=args.attention_scale)

        if args.freeze_transformer:
            model.freeze_transformer(tune_prefix_layers=args.tune_prefix_layers)

        # Task head selection.
        if model_type == 'multiple_choice' and args.cloze_eval:
            if not multi_token:
                model = GLMForSingleTokenCloze(model, take_softmax=args.adapet)
            elif args.fast_decode:
                model = GLMForMultiTokenClozeFast(model, length_penalty=args.length_penalty)
            else:
                model = GLMForMultiTokenCloze(model, length_penalty=args.length_penalty)
        elif model_type in ('multiple_choice', 'classification'):
            model = GLMForSequenceClassification(model,
                                                 args.hidden_size,
                                                 args.output_dropout,
                                                 args.pool_token,
                                                 num_class=num_labels)
        elif model_type not in (None, 'generation'):
            raise NotImplementedError(model_type)

    if mpu.get_data_parallel_rank() == 0:
        model_parallel_rank = mpu.get_model_parallel_rank()
        n_params = sum(p.nelement() for p in model.parameters())
        print(' > number of parameters on model parallel rank {}: {}'.format(
            model_parallel_rank, n_params), flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    training_requested = args.train_iters or args.epochs
    if not args.deepspeed and training_requested:
        ddp_impl = args.DDP_impl
        if ddp_impl == 'torch':
            device = torch.cuda.current_device()
            model = TorchDDP(model,
                             device_ids=[device],
                             output_device=device,
                             process_group=mpu.get_data_parallel_group())
        elif ddp_impl == 'local':
            model = LocalDDP(model)
        else:
            print_rank_0("Skip DDP model")

    return model