# Script 1: change the number of tensor-parallel (TP) partitions of a NeMo
# Megatron model. The import block below is an assumption reconstructed from
# the APIs used in this excerpt.
from argparse import ArgumentParser

import torch
from pytorch_lightning import Trainer

from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector
from nemo.utils import logging, model_utils
from nemo.utils.app_state import AppState


def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file", type=str, required=True, help="Path to source .nemo file")
    parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file")
    parser.add_argument("--tensor_model_parallel_size", type=int, required=True, help="TP size of source model")
    parser.add_argument(
        "--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model"
    )
    parser.add_argument(
        "--model_class",
        type=str,
        default="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel",
        help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel",
    )
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")

    args = parser.parse_args()

    # Cast precision to int when it is passed as the string "32" or "16".
    precision = args.precision
    if args.precision in ["32", "16"]:
        precision = int(float(args.precision))

    tp_size = args.tensor_model_parallel_size
    tgt_tp_size = args.target_tensor_model_parallel_size
    cls = model_utils.import_class_by_path(args.model_class)

    trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision)

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    if tp_size > 1:
        # Restore the model once per source TP rank and collect that rank's parameters.
        partitions = []
        for i in range(tp_size):
            app_state.tensor_model_parallel_rank = i
            model = cls.restore_from(restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"))
            params = [p for _, p in model.named_parameters()]
            partitions.append(params)

            # app_state is being updated incorrectly during restore
            app_state.data_parallel_rank = 0
            app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
            app_state.tensor_model_parallel_size = tp_size
            app_state.model_parallel_size = (
                app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
            )

        # Build an unpartitioned (TP=1) copy of the model and merge the collected
        # partitions into it. The merged .nemo file is only written here when the
        # target TP size is 1; otherwise it is written after the split below.
        model.cfg.tensor_model_parallel_size = 1
        app_state.model_parallel_size = 1
        trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        if tgt_tp_size > 1:
            merge_partition(model, partitions)
        else:
            merge_partition(model, partitions, args.target_file)
    else:
        app_state.model_parallel_size = 1
        model = cls.restore_from(restore_path=args.model_file, trainer=trainer)

    if tgt_tp_size > 1:
        # Collect the (now unpartitioned) parameters and split them across the
        # target TP size.
        partitions = []
        params = [p for _, p in model.named_parameters()]
        partitions.append(params)

        model.cfg.tensor_model_parallel_size = tgt_tp_size
        app_state.model_parallel_size = tgt_tp_size
        trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision)
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()
        split_partition(model, partitions, tgt_tp_size, args.target_file)

    logging.info("Successfully finished changing partitions!")
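# merge_partition() is called above but is not included in this excerpt. Below is
# a minimal illustrative sketch of what it has to do, assuming it is the exact
# inverse of split_partition() below (replicated parameters are taken from rank 0,
# parameters whose first dim matches are concatenated along the last dim, and
# everything else along the first dim). Treat it as a sketch, not the script's
# actual implementation.
def merge_partition(model, partitions, write_path=None):
    idx = 0
    for name, param in model.named_parameters():
        if param.shape == partitions[0][idx].shape:
            # Not tensor-parallel: every rank holds an identical copy.
            merged = partitions[0][idx].data
        elif param.shape[0] == partitions[0][idx].shape[0]:
            # Leading dim matches: the parameter was split along the last dim.
            merged = torch.cat([rank_params[idx].data for rank_params in partitions], dim=-1)
        else:
            # Otherwise the parameter was split along the first dim.
            merged = torch.cat([rank_params[idx].data for rank_params in partitions], dim=0)

        # If split_partition() padded this parameter, narrow it back down.
        if merged.shape != param.shape:
            if merged.shape[1:] == param.shape[1:]:
                merged = merged.narrow(0, 0, param.shape[0])
            elif merged.shape[:-1] == param.shape[:-1]:
                merged = merged.narrow(-1, 0, param.shape[-1])
            else:
                raise RuntimeError(
                    f"Can not handle parameter {name}, required shape: {param.shape}, merged shape: {merged.shape}."
                )

        param.data = merged
        idx += 1

    if write_path is not None:
        model.save_to(write_path)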
def split_partition(model, partitions, tp_size, write_path=None):
    if len(partitions) != 1:
        raise ValueError(
            "Can only split partitions of model with TP=1. For partitions of models with TP>1, merge first."
        )

    if tp_size < 1:
        raise ValueError("TP size must be >= 1.")

    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1  # not supported yet in this script
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
    app_state.tensor_model_parallel_rank = tp_size - 1

    # Compute the per-rank shards for every parameter.
    idx = 0
    splits = []
    for _, param in model.named_parameters():
        if param.shape == partitions[0][idx].shape:
            # Not tensor-parallel: replicate the parameter on every rank.
            split = [partitions[0][idx].data] * tp_size
        elif param.shape[0] == partitions[0][idx].shape[0]:
            # Leading dim matches: split along the last dim.
            split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1)
        else:
            # Otherwise split along the first dim.
            split = torch.split(partitions[0][idx].data, param.shape[0], dim=0)
        splits.append(split)
        idx += 1

    # Assign each rank's shards and save; ranks are written in reverse order so
    # that rank 0 is saved last.
    for i in range(tp_size - 1, -1, -1):
        app_state.tensor_model_parallel_rank = i

        idx = 0
        for name, param in model.named_parameters():
            split_val = splits[idx][i]

            if param.shape != split_val.shape:
                logging.info(
                    f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, split shape: "
                    f"{split_val.shape}. Padding to match required size."
                )

                if split_val.shape[1:] == param.shape[1:]:
                    pad = [0, 0] * len(split_val.shape)
                    pad[-1] = param.shape[0] - split_val.shape[0]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                elif split_val.shape[:-1] == param.shape[:-1]:
                    pad = [0, param.shape[-1] - split_val.shape[-1]]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                else:
                    raise RuntimeError(
                        f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
                    )

            param.data = split_val
            idx += 1

        if write_path is not None:
            model.save_to(write_path)
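# Entry point plus a hypothetical example invocation (the script and .nemo file
# names are illustrative assumptions):
#
#   python megatron_change_num_partitions.py \
#       --model_file=model_tp2.nemo \
#       --target_file=model_tp4.nemo \
#       --tensor_model_parallel_size=2 \
#       --target_tensor_model_parallel_size=4
if __name__ == '__main__':
    main()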
# Script 2: generate text (or compute logprobs) with a Megatron GPT model. This
# is a separate script from the partitioning tool above; its import paths are
# assumptions reconstructed from the APIs used in this excerpt.
import json
import os
from argparse import ArgumentParser

import torch
from pytorch_lightning import Trainer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from nemo.collections.nlp.data.language_modeling.megatron.gpt_request_dataset import GPTRequestDataset
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank


def main():
    parser = ArgumentParser()

    # args for loading the model, either from .nemo file or from PTL checkpoint
    parser.add_argument("--model_file", type=str, default="", required=False, help="Pass path to model's .nemo file")
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default=None,
        required=False,
        help="If not using a .nemo file. Path to PTL checkpoints saved during training. Ex: /raid/nemo_experiments/megatron_gpt/checkpoints",
    )
    parser.add_argument(
        "--checkpoint_name",
        type=str,
        default=None,
        required=False,
        help="If not using a .nemo file. Name of checkpoint to be used. Ex: megatron_gpt--val_loss=6.34-step=649-last.ckpt",
    )
    parser.add_argument(
        "--hparams_file",
        type=str,
        default=None,
        required=False,
        help="If not using a .nemo file. Path to config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
    )
    parser.add_argument(
        "--tensor_model_parallel_size", type=int, default=1, required=False, help="Needed if not using a .nemo file"
    )
    parser.add_argument(
        "--pipeline_model_parallel_size", type=int, default=1, required=False, help="Needed if not using a .nemo file"
    )

    # PTL Trainer args
    parser.add_argument("--devices", default=1, type=int, help="PyTorch Lightning Trainer devices flag")
    parser.add_argument("--num_nodes", default=1, type=int, help="PyTorch Lightning Trainer num_nodes flag")
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")

    # evaluation args
    parser.add_argument(
        "--path_to_file", type=str, default="", required=False, help="Path to file with prompts (a text to complete)"
    )
    parser.add_argument(
        "--prompt", type=str, default="", required=False, help="Prompt for the model (a text to complete)"
    )
    parser.add_argument("--use_soft_prompts", action="store_true", help="Use model's existing soft prompts")
    parser.add_argument(
        "--prompt_tag", type=str, default="", required=False, help="Prompt tag string for task specific soft prompt"
    )
    parser.add_argument(
        "--tokens_to_generate", type=int, default=1, required=False, help="How many tokens to add to prompt"
    )
    # NOTE: argparse's type=bool treats any non-empty string (including "False") as True.
    parser.add_argument(
        "--stop_after_sentence",
        type=bool,
        default=True,
        required=False,
        help="True/False: whether to stop after full sentence has been generated.",
    )
    parser.add_argument("--batch_size", default=1, type=int, required=False, help="Evaluation batch_size")
    parser.add_argument(
        "--compute_logprobs", type=bool, default=False, required=False, help="Method for logprobs computation"
    )

    args = parser.parse_args()

    assert (
        args.devices * args.num_nodes == args.tensor_model_parallel_size * args.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    if args.model_file and args.checkpoint_dir:
        raise ValueError("Only one of model_file or checkpoint_dir should be used")

    # cast precision to int if 32 or 16
    if args.precision in ["32", "16"]:
        args.precision = int(float(args.precision))

    # trainer required for restoring model parallel models
    trainer = Trainer(
        plugins=[NLPDDPPlugin()],
        devices=args.devices,
        num_nodes=args.num_nodes,
        accelerator='gpu',
        precision=args.precision,
    )

    if args.model_file:
        model = MegatronGPTModel.restore_from(restore_path=args.model_file, trainer=trainer)
    elif args.checkpoint_dir:
        app_state = AppState()
        if args.tensor_model_parallel_size > 1 or args.pipeline_model_parallel_size > 1:
            app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
            app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
            app_state.model_parallel_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size
            # Compute this process's model-parallel ranks without initializing
            # a real process group.
            (
                app_state.tensor_model_parallel_rank,
                app_state.pipeline_model_parallel_rank,
                app_state.model_parallel_size,
                _,
            ) = fake_initialize_model_parallel(
                world_size=app_state.model_parallel_size,
                rank=trainer.global_rank,
                tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
                pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
            )

        # inject model parallel rank
        checkpoint_path = inject_model_parallel_rank(os.path.join(args.checkpoint_dir, args.checkpoint_name))

        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)

    model.freeze()

    def pad_collate(batch):
        tokens, tokens_to_generate = batch[0]['data'], batch[0]['tokens_to_generate']
        compute_logprobs = batch[0]['compute_logprobs']
        lens = [len(token) for token in tokens]

        # Pad with the GPT-2 <|endoftext|> token id (50256).
        tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=50256)
        data = []

        if 'prompt_tags' in batch[0]:
            # Keep track of soft prompt tags
            prompt_tags = batch[0]['prompt_tags']

            for token, lenn, prompt_tag in zip(tokens_pad.T, lens, prompt_tags):
                data.append((token, lenn, tokens_to_generate, compute_logprobs, prompt_tag))
        else:
            for token, lenn in zip(tokens_pad.T, lens):
                data.append((token, lenn, tokens_to_generate, compute_logprobs))

        return data

    # defining type of request
    if args.path_to_file != "":
        request = []
        with open(args.path_to_file, 'r', encoding='utf-8') as prompts:
            for prompt in prompts.readlines():
                prompt = prompt.split('\n')[0]

                if args.use_soft_prompts and model.use_soft_prompts:
                    prompt = json.loads(prompt)

                request.append(prompt)

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=int(args.batch_size))
    else:
        if args.use_soft_prompts and model.use_soft_prompts:
            request = [{'prompt_tag': args.prompt_tag, 'text': args.prompt}]
        else:
            request = [args.prompt]

        dataset = GPTRequestDataset(request, model.tokenizer, args.tokens_to_generate, args.compute_logprobs)
        request_dl = DataLoader(dataset=pad_collate(dataset), batch_size=1)

    # For GPT models that have had soft prompt tuning but you don't want to use any soft prompts
    if not args.use_soft_prompts and model.use_soft_prompts:
        model.use_soft_prompts = False

    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
    if args.prompt and not args.compute_logprobs:
        print(f'Prompt: {args.prompt}\n\nResponse: {response[0][0][0]}')
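# Entry point plus a hypothetical example invocation (the script and .nemo file
# names are illustrative assumptions; devices * num_nodes must match the
# model-parallel size, here 1 * 1):
#
#   python megatron_gpt_eval.py \
#       --model_file=megatron_gpt.nemo \
#       --prompt="Deep learning is" \
#       --tokens_to_generate=32 \
#       --devices=1 --num_nodes=1
if __name__ == '__main__':
    main()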