def get_gpt2_model(args_others, mp_size=1):
    from megatron.model import GPT2Model
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }

    args_defaults.update(args_others)

    # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)
    model = GPT2Model(num_tokentypes=0, parallel_output=False)
    model.cuda()

    from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
    from megatron import mpu
    i = torch.cuda.current_device()
    model = torchDDP(model,
                     device_ids=[i],
                     output_device=i,
                     process_group=mpu.get_data_parallel_group())

    return model
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
                                       'no_load_rng': True,
                                       'no_load_optim': True})

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported for text generation.")
        exit()

    # Set up model and load checkpoint.
    model = get_model(model_provider)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    assert len(model) == 1, "Above condition should have caught this"
    model = model[0]

    # Generate samples.
    if args.num_samples == 0:
        args.micro_batch_size = 1
        if args.sample_input_file is not None:
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
def setup_for_inference_or_eval(inference=True, get_key_value=True, overwrite_values=None):
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )
    print_rank_0('Finished loading model')
    return model, neox_args
def setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values=None,
):
    """
    Initializes the model for evaluation or inference (doesn't load optimizer states, etc.) from command line args.

    use_cache: bool
        Whether to use key-value caching in inference.
    overwrite_values: dict
        Optional values to overwrite in the model config.
    """
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
        # Disable ZeRO optimization: it isn't used in inference, and loading the ZeRO optimizer state can cause errors.
        "zero_optimization": None,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint.
    # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args,
        use_cache=use_cache,
        iteration=neox_args.iteration,
    )
    print_rank_0("Finished loading model")

    model.module.inference_mode(use_cache=use_cache)
    return model, neox_args
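# A minimal usage sketch for the setup_for_inference_or_eval helper above. The
# generate_samples_from_prompt import and its keyword arguments are assumptions
# (based on GPT-NeoX's text-generation utilities), not something shown in these
# snippets; treat the names as illustrative only.
from megatron.text_generation_utils import generate_samples_from_prompt

model, neox_args = setup_for_inference_or_eval(use_cache=True)
samples = generate_samples_from_prompt(
    neox_args=neox_args,
    model=model,
    text="Hello, my name is",
    maximum_tokens=64,
)
print(samples)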
def main():
    """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset
    - Include all args needed for initial model specification

    Other key args:
        --block-data-path: path to write to
        --ict-load or --realm-load: path to checkpoint with which to embed
        --data-path and --titles-data-path: paths for dataset
        --indexer-log-interval: reporting interval
        --indexer-batch-size: size specific for indexer jobs

    Check README.md for example script
    """
    initialize_megatron(
        extra_args_provider=None,
        args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})

    index_builder = IndexBuilder()
    index_builder.build_and_save_index()
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': get_test_path('gpt2-vocab.json'),
        'merge_file': get_test_path('gpt2-merges.txt'),
        'tokenizer_type': 'GPT2BPETokenizer',
    }

    args_defaults.update(args_others)

    # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # Hardcode the attention mask for testing; pipeline parallelism requires the attn_mask to be stashed.
            attention_mask = torch.tensor([[True]],
                                          device=torch.cuda.current_device())
            return super().forward(args, attention_mask)

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
def __init__(
    self,
    model_name,
    vocab_file,
    hidden_size=1024,
    num_attention_heads=16,
    num_layers=24,
    max_seq_length=512,
    tokenizer_type='BertWordPieceLowerCase',
    init_method_std=0.02,
    num_tokentypes=2,
):
    super().__init__()

    if not os.path.exists(vocab_file):
        raise ValueError(f'Vocab file not found at {vocab_file}')

    megatron_args = {
        "num_layers": num_layers,
        "hidden_size": hidden_size,
        "num_attention_heads": num_attention_heads,
        "max_position_embeddings": max_seq_length,
        "tokenizer_type": tokenizer_type,
        "vocab_file": vocab_file,
    }

    initialize_megatron(None, megatron_args, ignore_unknown_args=True)

    init_method = init_method_normal(init_method_std)

    self.language_model, self._language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        init_method=init_method,
        scaled_init_method=scaled_init_method_normal(init_method_std,
                                                     num_layers),
    )

    self.language_model.to(self._device)
    self._hidden_size = self.language_model.hidden_size
def __init__(self, num_layers, mp_size, args_others, topo, **kwargs):
    from megatron.initialize import initialize_megatron

    args_defaults = {
        'vocab_file': 'tests/unit/gpt2-vocab.json',
        'merge_file': 'tests/unit/gpt2-merges.txt',
        'tokenizer_type': 'GPT2BPETokenizer',
    }

    args_defaults.update(args_others)

    # Set "make-vocab-size-divisible-by" to 1 to avoid a word-embedding size change in the resizing test.
    sys.argv.extend([
        '--model-parallel-size', str(mp_size),
        '--make-vocab-size-divisible-by', str(1)
    ])

    initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True)

    from megatron.model.transformer import ParallelTransformerLayer

    class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        def forward(self, args):
            # args is (hidden_states, attention_mask); pass the mask through so the next pipeline stage can reuse it.
            hidden_states, attention_mask = args[0], args[1]
            return super().forward(*args), attention_mask

    layers = []
    for x in range(num_layers):
        layers.append(
            LayerSpec(ParallelTransformerLayerPipe,
                      self.gpt2_attention_mask_func,
                      self.init_method_normal(0.02),
                      self.scaled_init_method_normal(0.02, num_layers),
                      x))

    super().__init__(layers=layers,
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     topology=topo,
                     **kwargs)
def main():
    """Main program."""

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if args.num_samples == 0:
        args.batch_size = 1
        if args.sample_input_file != "":
            generate_samples_input_from_file(model)
        else:
            generate_samples_interactive(model)
    else:
        generate_and_write_samples_unconditional(model)
def __init__(self,
             max_seq_len=DEFAULT_MAX_SEQ_LEN,
             vocab_path=DEFAULT_VOCAB_PATH,
             regex=REGEX,
             default_chem_token_start=DEFAULT_CHEM_TOKEN_START,
             checkpoints_dir=CHECKPOINTS_DIR,
             num_layers=DEFAULT_NUM_LAYERS,
             hidden_size=DEFAULT_D_MODEL,
             num_attention_heads=DEFAULT_NUM_HEADS,
             decoder_max_seq_len=None) -> None:
    super().__init__()

    # Disable gradients globally rather than relying on a `with torch.no_grad():`
    # context, which would exit before the model is actually used.
    torch.set_grad_enabled(False)

    self.device = 'cuda'  # Megatron arg loading seems to only work with GPU
    self.min_jitter_radius = 1.0
    self.max_model_position_embeddings = max_seq_len

    args = {
        'num_layers': num_layers,
        'hidden_size': hidden_size,
        'num_attention_heads': num_attention_heads,
        'max_position_embeddings': self.max_model_position_embeddings,
        'tokenizer_type': 'GPT2BPETokenizer',
        'vocab_file': vocab_path,
        'load': checkpoints_dir
    }

    with torch.no_grad():
        initialize_megatron(args_defaults=args, ignore_unknown_args=True)
        args = get_args()
        self.tokenizer = self.load_tokenizer(args.vocab_file, regex,
                                             default_chem_token_start)
        self.model = self.load_model(args, self.tokenizer, decoder_max_seq_len)
                       default=None,
                       help='Whitespace separated paths or corpora names '
                            'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='path(s) to the validation data.')
    group.add_argument('--overlapping-eval', type=int, default=32,
                       help='Sliding window for overlapping evaluation.')
    group.add_argument('--strict-lambada', action='store_true',
                       help='Use more difficult formulation of lambada.')

    return parser


if __name__ == '__main__':

    initialize_megatron(extra_args_provider=get_tasks_args)

    args = get_args()

    if args.task in ['LAMBADA', 'WIKITEXT103']:
        from zeroshot_gpt2.evaluate import main
    else:
        raise NotImplementedError('Task {} is not implemented.'.format(
            args.task))

    main()
def pretrain(train_valid_test_dataset_provider, model_provider,
             forward_step_func, extra_args_provider=None, args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.

    Arguments:
        train_valid_test_dataset_provider: a function that takes the size of
            train/valid/test dataset and returns `train, valid, test` datasets.
        model_provider: a function that returns a vanilla version of the
            model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
        forward_step_func: a function that takes a `data iterator` and `model`,
            and returns a `loss` scalar with a dictionary with key:values being
            the info we would like to monitor during training, for example
            `lm-loss: value`. We also require that this function add
            `batch generator` to the timers class.
        extra_args_provider: a function that takes a parser and adds arguments
            to it. It is used for programs to add their own arguments.
        args_defaults: a dictionary from argument-name to argument-value. It is
            used to set default values for already-parsed arguments.
    """

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # Data stuff.
    timers('train/valid/test data iterators').start()
    train_data_iterator, valid_data_iterator, test_data_iterator \
        = build_train_valid_test_data_iterators(
            train_valid_test_dataset_provider)
    timers('train/valid/test data iterators').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['model and optimizer', 'train/valid/test data iterators'])
    print_rank_0('training ...')

    iteration = 0
    if args.do_train and args.train_iters > 0:
        iteration = train(forward_step_func, model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model,
                                   iteration, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model,
                                   0, True)
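# A minimal driver sketch showing how the pretrain() entry point above is typically
# invoked, modeled on Megatron-LM's pretrain_gpt2.py. The provider bodies here are
# hypothetical placeholders; a real training script supplies its own dataset, model,
# and forward-step logic.
def model_provider():
    """Return the bare model (CPU, no fp16/DDP wrapping); pretrain() handles the rest."""
    return GPT2Model(num_tokentypes=0, parallel_output=True)


def forward_step(data_iterator, model):
    """Return the loss plus a dict of values to monitor, e.g. {'lm loss': loss}."""
    ...


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build and return (train, valid, test) datasets of the requested sizes."""
    ...


if __name__ == "__main__":
    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})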
def pretrain(train_valid_test_dataset_provider, model_provider,
             forward_step_func, extra_args_provider=None, args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.

    Arguments:
        train_valid_test_dataset_provider: a function that takes the size of
            train/valid/test dataset and returns `train, valid, test` datasets.
        model_provider: a function that returns a vanilla version of the
            model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
        forward_step_func: a function that takes a `data iterator` and `model`,
            and returns a `loss` scalar with a dictionary with key:values being
            the info we would like to monitor during training, for example
            `lm-loss: value`. We also require that this function add
            `batch generator` to the timers class.
        extra_args_provider: a function that takes a parser and adds arguments
            to it. It is used for programs to add their own arguments.
        args_defaults: a dictionary from argument-name to argument-value. It is
            used to set default values for already-parsed arguments.
    """

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    # Adjust the startup time so it reflects the largest value.
    # This will be closer to what the scheduler will see (outside of
    # image ... launches.
    global _TRAIN_START_TIME
    start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME])
    torch.distributed.all_reduce(start_time_tensor,
                                 op=torch.distributed.ReduceOp.MIN)
    _TRAIN_START_TIME = start_time_tensor.item()
    print_rank_0('time to initialize megatron (seconds): {:.3f}'.format(
        time.time() - _TRAIN_START_TIME))
    print_datetime('after megatron is initialized')

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()
    print_datetime('after model, optimizer, and learning rate '
                   'scheduler are built')

    # Data stuff.
    timers('train/valid/test data iterators').start()
    train_data_iterator, valid_data_iterator, test_data_iterator \
        = build_train_valid_test_data_iterators(
            train_valid_test_dataset_provider)
    timers('train/valid/test data iterators').stop()
    print_datetime('after dataloaders are built')

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['model and optimizer', 'train/valid/test data iterators'])
    print_rank_0('training ...')

    iteration = 0
    if args.do_train and args.train_iters > 0:
        iteration = train(forward_step_func, model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)
    print_datetime('after training is done')

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model,
                                   iteration, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model,
                                   0, True)
def pretrain(neox_args):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model.

    Arguments:
        neox_args: an instance of NeoXArgs containing the configuration for pretrain
    """
    # setup logging and timers
    init_wandb(neox_args=neox_args)
    timers = Timers(use_wandb=neox_args.use_wandb,
                    tensorboard_writer=neox_args.tensorboard_writer)

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(neox_args=neox_args)

    # Model, optimizer, and learning rate.
    timers("model and optimizer").start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=neox_args, use_cache=False)
    timers("model and optimizer").stop()

    # Data stuff.
    timers("train/valid/test data iterators").start()
    (
        train_data_iterator,
        valid_data_iterator,
        test_data_iterator,
    ) = build_train_valid_test_data_iterators(neox_args=neox_args)
    timers("train/valid/test data iterators").stop()

    # Print setup timing.
    print_rank_0("done with setups ...")
    timers.log(["model and optimizer", "train/valid/test data iterators"])
    print_rank_0("training ...")

    iteration = 0
    if neox_args.do_train and neox_args.train_iters > 0:
        iteration = train(
            neox_args=neox_args,
            timers=timers,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            train_data_iterator=train_data_iterator,
            valid_data_iterator=valid_data_iterator,
        )

    if neox_args.do_valid:
        prefix = "the end of training for val data"
        evaluate_and_print_results(
            neox_args=neox_args,
            prefix=prefix,
            forward_step_func=forward_step,
            data_iterator=valid_data_iterator,
            model=model,
            iteration=iteration,
            verbose=False,
            timers=timers,
        )

    if neox_args.save and iteration != 0:
        save_checkpoint(
            neox_args=neox_args,
            iteration=iteration,
            model=model,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
        )

    if neox_args.do_test:
        # Run on test data.
        prefix = "the end of training for test data"
        evaluate_and_print_results(
            neox_args=neox_args,
            prefix=prefix,
            forward_step_func=forward_step,
            data_iterator=test_data_iterator,
            model=model,
            iteration=0,  # iteration 0 in order to always use full test data
            verbose=True,
            timers=timers,
        )
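# A minimal driver sketch for the NeoX-style pretrain(neox_args) above, roughly
# mirroring a GPT-NeoX train.py entry point. The exact setup sequence is an
# assumption pieced together from the setup_for_inference_or_eval snippets earlier
# in this file, not something shown alongside this pretrain() definition.
from megatron.neox_arguments import NeoXArgs
from megatron.training import pretrain

if __name__ == "__main__":
    neox_args = NeoXArgs.consume_neox_args()   # parse YAML configs / command line
    neox_args.configure_distributed_args()     # set up ranks, world size, etc.
    neox_args.build_tokenizer()                # tokenizer must exist before data loading
    pretrain(neox_args=neox_args)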
def main():
    """Main program."""
    drmode = 0
    mode = 0

    initialize_megatron(extra_args_provider=add_text_generate_args,
                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

    # Set up model and load checkpoint.
    model = get_model(model_provider)
    args = get_args()
    tokenizer = get_tokenizer()
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Generate samples.
    if drmode == 1:
        f = open("questions.txt", 'r')
        if mode == 0:
            dir = "qa_345M"
        else:
            dir = "qa_345M_ip"
    if drmode == 0:
        f = open("para.txt", 'r')
        if mode == 0:
            dir = "pa_345M"
        else:
            dir = "pa_345M_ip"

    qs = f.readlines()
    question_list = []
    import json
    for i in qs:
        question_list.append(i)
    f.close()

    fdir = os.listdir()
    if not (dir in fdir):
        os.mkdir(dir)

    import random
    import jsonlines
    while True:
        q = random.choice(question_list)
        lists = os.listdir(dir)
        question = q
        lts = question[:20] + '.jsonl'
        if (lts in lists):
            continue
        #str=generate_token_tensor(str,tokenizer)

        if mode == 0:
            output_string = generate_one_text(model, tokenizer, args, question)
            print(question, output_string)

            text_dir = dir + "/"
            already = []
            with jsonlines.open(text_dir + question[:20] + '.jsonl',
                                mode='w') as writer:
                otc = {}
                otc['question'] = question
                otc['answer'] = output_string
                #print(otc)
                writer.write(otc)
        else:
            output_string, output_scores = generate_string(
                model, tokenizer, args, question)
            ranklist = np.argsort(output_scores)
            best_score = output_scores[ranklist[0]]
            text_dir = dir + "/"
            already = []
            with jsonlines.open(text_dir + question[:20] + '.jsonl',
                                mode='w') as writer:
                otc = {}
                otc['question'] = question
                otc['answer'] = output_string[ranklist[0]]
                #print(otc)
                writer.write(otc)
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') group.add_argument("--out-seq-length", type=int, default=1024, help='Size of the output generated text.') return parser if __name__ == "__main__": initialize_megatron(extra_args_provider=add_text_generate_args, args_defaults={ 'tokenizer_type': 'GPT2BPETokenizer', 'no_load_rng': True, 'no_load_optim': True }) args = get_args() if args.num_layers_per_virtual_pipeline_stage is not None: print( "Interleaved pipeline schedule is not yet supported for text generation." ) exit() # Set up model and load checkpoint model = get_model(model_provider, wrap_with_ddp=False) if args.load is not None: _ = load_checkpoint(model, None, None)