def prepare_model_and_optimizer(args, device):
    # Prepare model
    config = BertConfig.from_json_file(args.config_file)

    # Pad the vocabulary for divisibility by 8 (Tensor Core alignment)
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    model = BertForPreTraining(config)

    checkpoint = None
    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1 and not args.init_checkpoint:
            # Pick the latest checkpoint in output_dir by its step suffix
            model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
            args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])

        global_step = args.resume_step if not args.init_checkpoint else 0

        if not args.init_checkpoint:
            checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)),
                                    map_location="cpu")
        else:
            checkpoint = torch.load(args.init_checkpoint, map_location="cpu")

        model.load_state_dict(checkpoint['model'], strict=False)

        if args.phase2:
            global_step -= args.phase1_end_step
        if is_main_process():
            print("resume step from ", args.resume_step)

    model.to(device)

    # Exclude biases and LayerNorm weights from weight decay, per the BERT recipe
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    optimizer = FusedLAMB(optimizer_grouped_parameters, lr=args.learning_rate)
    lr_scheduler = PolyWarmUpScheduler(optimizer,
                                       warmup=args.warmup_proportion,
                                       total_steps=args.max_steps)

    if args.fp16:
        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    if args.resume_from_checkpoint:
        if args.phase2 or args.init_checkpoint:
            # Override hyperparameters from the previous checkpoint
            keys = list(checkpoint['optimizer']['state'].keys())
            for key in keys:
                checkpoint['optimizer']['state'][key]['step'] = global_step
            for i, item in enumerate(checkpoint['optimizer']['param_groups']):
                checkpoint['optimizer']['param_groups'][i]['step'] = global_step
                checkpoint['optimizer']['param_groups'][i]['t_total'] = args.max_steps
                checkpoint['optimizer']['param_groups'][i]['warmup'] = args.warmup_proportion
                checkpoint['optimizer']['param_groups'][i]['lr'] = args.learning_rate
        optimizer.load_state_dict(checkpoint['optimizer'])

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer),
                                          checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model,
                        message_size=250000000,
                        gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()],
                           torch.distributed.broadcast, (0,))
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, optimizer, lr_scheduler, checkpoint, global_step
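
# Both versions of this function build their optimizer parameter groups the
# same way: any parameter whose name contains 'bias', 'gamma', 'beta', or
# 'LayerNorm' is excluded from weight decay. A minimal, self-contained sketch
# of that pattern follows; it is illustrative only, with torch.optim.AdamW
# standing in for apex's FusedLAMB and TinyBlock standing in for
# BertForPreTraining.
def _weight_decay_grouping_demo():
    import torch

    class TinyBlock(torch.nn.Module):
        # Attribute names flow into named_parameters(), so the norm weights
        # show up as 'LayerNorm.weight' / 'LayerNorm.bias' and match the
        # substring filter below.
        def __init__(self):
            super().__init__()
            self.dense = torch.nn.Linear(16, 16)
            self.LayerNorm = torch.nn.LayerNorm(16)

    model = TinyBlock()
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    grouped = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # Only 'dense.weight' lands in the decayed group.
    return torch.optim.AdamW(grouped, lr=1e-3)
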
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    config = BertConfig.from_json_file(args.bert_config_path)

    # Propagate kernel-fusion and padding options from the CLI into the config
    config.fused_mha = args.fused_mha
    config.fused_gelu_bias = args.fused_gelu_bias
    config.dense_seq_output = args.dense_seq_output
    config.unpad = args.unpad
    config.pad = args.pad
    config.fuse_qkv = not args.disable_fuse_qkv
    config.fuse_scale = not args.disable_fuse_scale
    config.fuse_mask = not args.disable_fuse_mask
    config.fuse_dropout = args.enable_fuse_dropout
    config.apex_softmax = not args.disable_apex_softmax
    config.enable_stream = args.enable_stream
    # Resolve option interdependencies
    if config.fuse_mask:
        config.apex_softmax = True
    if not config.pad:
        config.enable_stream = True
    if config.unpad:
        config.fused_mha = False

    # Pad the vocabulary for divisibility by 8 (Tensor Core alignment)
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Load from a PyTorch checkpoint - either given as init_checkpoint,
    # or picked up from output_dir if found
    if args.init_checkpoint is not None or found_resume_checkpoint(args):
        # Prepare model
        model = BertForPreTraining(config)
        if args.init_checkpoint is None:  # find the latest checkpoint in output_dir
            checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
            model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
            global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
            args.resume_step = global_step  # used for throughput computation

            resume_init_checkpoint = os.path.join(args.output_dir,
                                                  checkpoint_str.replace("*", str(global_step)))
            print("Setting init checkpoint to %s - which is the latest in %s"
                  % (resume_init_checkpoint, args.output_dir))
            checkpoint = torch.load(resume_init_checkpoint, map_location="cpu")
        else:
            checkpoint = torch.load(args.init_checkpoint, map_location="cpu")["model"]

        # Fused MHA requires a remapping of checkpoint parameters
        if config.fused_mha:
            checkpoint_remapped = remap_attn_parameters(checkpoint)
            model.load_state_dict(checkpoint_remapped, strict=False)
        else:
            model.load_state_dict(checkpoint, strict=True)
    else:
        # Load from a TensorFlow checkpoint
        model = BertForPreTraining.from_pretrained(args.init_tf_checkpoint,
                                                   from_tf=True, config=config)

    model.to(device)

    # Exclude biases and LayerNorm weights from weight decay, per the BERT recipe
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    mlperf_logger.log_event(key=mlperf_logger.constants.OPT_BASE_LR,
                            value=args.learning_rate, sync=False)
    optimizer = FusedLAMB(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2))
    mlperf_logger.log_event(key='opt_epsilon', value=optimizer.defaults['eps'], sync=False)
    b1, b2 = optimizer.defaults['betas']
    mlperf_logger.log_event(key='opt_lamb_beta_1', value=b1, sync=False)
    mlperf_logger.log_event(key='opt_lamb_beta_2', value=b2, sync=False)
    mlperf_logger.log_event(key='opt_lamb_weight_decay_rate',
                            value=optimizer.defaults['weight_decay'], sync=False)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step
    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer,
                                                  start_warmup_steps=warmup_start,
                                                  warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps,
                                                  end_learning_rate=0.0,
                                                  degree=1.0)

    if args.fp16:
        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              loss_scale=args.loss_scale)
        amp._amp_state.loss_scalers[0]._loss_scale = float(os.getenv("INIT_LOSS_SCALE", 2**20))

    if found_resume_checkpoint(args):
        # Restores the m/v optimizer states (only when resuming from a checkpoint,
        # not for init_checkpoint and init_tf_checkpoint for now)
        optimizer.load_state_dict(checkpoint['optimizer'])

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer),
                                          checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model,
                        message_size=250000000,
                        gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()],
                           torch.distributed.broadcast, (0,))

    return model, optimizer, lr_scheduler, checkpoint, global_step
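
# A hedged sketch (not part of the original source) of how the returned tuple
# is typically consumed. `train_dataloader`, the batch layout, and the forward
# call returning a single combined pretraining loss are all assumptions here.
def _example_training_loop(args, device, train_dataloader):
    model, optimizer, lr_scheduler, checkpoint, global_step = \
        prepare_model_and_optimizer(args, device)
    model.train()
    for batch in train_dataloader:
        batch = [t.to(device) for t in batch]
        loss = model(*batch)  # assumed to return the combined MLM+NSP loss
        if args.fp16:
            # apex.amp scales the loss through the optimizer's loss scaler
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        lr_scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
    return global_step
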