def load_checkpoint(model, optimizer, lr_scheduler, args):
    """Load model weights (and optionally training state) described by ``args``.

    Two sources are supported:

    * ``args.load_openai``: pretrained GPT-2 weights are read with
      ``transformers.GPT2LMHeadModel.from_pretrained(args.load)`` and copied
      into ``model`` via ``move_weights``. No optimizer, iteration or RNG
      entries exist for this source (the state dict is left empty).
    * otherwise: a checkpoint file located with ``get_checkpoint_name`` is
      read with ``torch.load`` and its ``'model'`` entry is loaded into
      ``model``; optimizer / scheduler / RNG state are restored unless
      disabled by ``release``, ``args.finetune``, ``args.no_load_optim`` or
      ``args.no_load_rng``.

    Args:
        model: Module to fill. A ``torchDDP`` wrapper is unwrapped before
            ``load_state_dict``; on the OpenAI path ``DDP`` / ``FP16_Module``
            wrappers are unwrapped before the weights are copied.
        optimizer: Optimizer to restore, or ``None`` to skip it.
        lr_scheduler: LR scheduler to restore, or ``None`` to skip it.
        args: Parsed arguments. Reads ``load``, ``deepspeed``,
            ``load_openai``, ``finetune``, ``no_load_optim`` and
            ``no_load_rng``.

    Returns:
        ``0`` when ``get_checkpoint_iteration`` reports no success, or when
        fine-tuning / loading a release checkpoint; otherwise the value
        stored under ``'iteration'`` (or the legacy ``'total_iters'``) in the
        checkpoint.

    Raises:
        NotImplementedError: if ``args.deepspeed`` is set.
        SystemExit: via ``exit()`` when a required checkpoint entry is
            missing.
    """
    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if args.deepspeed:
        # `NotImplemented` is the binary-operator sentinel and is not
        # callable; the exception class is `NotImplementedError`.
        raise NotImplementedError("No installed deep speed")

    if args.load_openai:
        # Imported lazily so these packages are only needed on this path.
        from utils import move_weights
        from model import DistributedDataParallel as DDP
        from fp16 import FP16_Module
        from transformers import GPT2LMHeadModel

        model_path = args.load
        print('global rank {} is loading openai weights {}'.format(
            torch.distributed.get_rank(), model_path))
        model.cpu()
        gpt2model = GPT2LMHeadModel.from_pretrained(
            model_path, cache_dir='gpt2_weights')

        # Peel off the parallel / fp16 wrappers to reach the bare module.
        model2fill = model
        while isinstance(model2fill, (DDP, FP16_Module)):
            model2fill = model2fill.module
        move_weights(model2fill, gpt2model)
        model.cuda(torch.cuda.current_device())

        # The messages further down report `checkpoint_name`; bind it here
        # so this path does not end in a NameError.
        checkpoint_name = model_path
        # Nothing but weights is available from this source. With an empty
        # state dict the iteration / RNG lookups below can only succeed when
        # they are skipped (args.finetune, release, args.no_load_rng);
        # otherwise they report the missing entry and exit.
        sd = {}
    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0(
                'A metadata file exists but unable to load model '
                'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:
                # Backward compatible with older checkpoints.
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but Unable to load iteration '
                    ' from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # RNG states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            # This message used to be a copy of the optimizer one and
            # pointed at the wrong state and the wrong flag.
            print_rank_0(
                'Unable to load rng state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the rng '
                'state.'.format(checkpoint_name))
            exit()

    # Wait for every rank before reporting success.
    torch.distributed.barrier()
    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return iteration
def _build_hf_gpt2_lm_head(num_layers, model_path):
    """Build a ``pytorch_pretrained_bert`` GPT2LMHeadModel (not moved to GPU).

    For ``num_layers == 24`` the base ``GPT2Model`` is read from
    ``model_path`` with ``from_tf=True`` and its weights are copied into the
    transformer of a fresh LM-head model built from the same config. For any
    other layer count the stock ``'gpt2'`` LM-head model is loaded and
    ``model_path`` is ignored.
    """
    # Imported lazily so the package is only required when HF weights are used.
    from pytorch_pretrained_bert import GPT2LMHeadModel
    from pytorch_pretrained_bert import GPT2Model as HFGPT2Model

    if num_layers != 24:
        return GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='gpt2_weights')

    hfmodel = HFGPT2Model.from_pretrained(
        model_path, cache_dir='gpt2_weights', from_tf=True)
    lm_model = GPT2LMHeadModel(hfmodel.config)
    lm_model.transformer.load_state_dict(hfmodel.state_dict())
    return lm_model


def main():
    """Entry point: build a GPT-2 model and evaluate it on the eval data.

    The model comes from one of three places, selected by ``args``:

    * ``args.eval_hf``: a HuggingFace GPT2LMHeadModel is evaluated directly.
    * ``args.load_openai``: a model is built with ``setup_model`` and filled
      with HuggingFace GPT-2 weights through ``move_weights``.
    * otherwise: whatever ``setup_model(args)`` returns.

    Side effect: on the ``load_openai`` path ``args.load`` is set to ``None``.
    """
    print('Evaluate GPT2 model')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    eval_data = get_eval_data(args)

    # Model.
    if args.eval_hf:
        model = _build_hf_gpt2_lm_head(args.num_layers, args.load).cuda()
    elif args.load_openai:
        from utils import move_weights

        # Remember the weights location before clearing args.load. This value
        # used to be overwritten with 'gpt2' straight away, so the 24-layer
        # case never looked at the user-supplied path (unlike the eval_hf
        # branch). Fall back to 'gpt2' when no path was given.
        openai_path = args.load if args.load is not None else 'gpt2'
        # NOTE(review): presumably cleared so setup_model does not try to
        # restore a checkpoint from that path - confirm against setup_model.
        args.load = None
        model = setup_model(args)

        print('loading openai weights')
        model.cpu()
        gpt2model = _build_hf_gpt2_lm_head(args.num_layers, openai_path)

        # Peel off the parallel / fp16 wrappers to reach the bare module.
        model2fill = model
        while isinstance(model2fill, (DDP, FP16_Module)):
            model2fill = model2fill.module
        move_weights(model2fill, gpt2model)
        model.cuda()
    else:
        model = setup_model(args)

    # Run on test data. The label is hard-coded rather than derived from
    # args.valid_data.
    prefix = "wiki"
    evaluate_and_print_results(prefix, eval_data, model, args, timers)