def test_neoxargs_consume_deepy_args_with_config_dir():
    """
    verify consume_deepy_args processes command line arguments including config dir
    """
    from unittest.mock import patch

    from megatron.neox_arguments import NeoXArgs

    # get_root_directory, get_config_directory and get_configs_with_path are shared
    # helpers from the test suite

    # load neox args with command line
    with patch(
        "sys.argv",
        [
            str(get_root_directory() / "deepy.py"),
            "train.py",
            "-d",
            str(get_config_directory()),
        ]
        + ["small.yml", "local_setup.yml"],
    ):
        args_loaded_consume = NeoXArgs.consume_deepy_args()

    # load neox args directly from yaml files
    args_loaded_yamls = NeoXArgs.from_ymls(
        get_configs_with_path(["small.yml", "local_setup.yml"])
    )

    # update values from yaml files that cannot otherwise be matched
    args_loaded_yamls.update_value("user_script", "train.py")
    args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group

    assert args_loaded_yamls == args_loaded_consume
def run_neox_args_load_test(yaml_files):
    import yaml

    from megatron.neox_arguments import NeoXArgs

    yaml_list = get_configs_with_path(yaml_files)
    args_loaded = NeoXArgs.from_ymls(yaml_list)
    assert isinstance(args_loaded, NeoXArgs)

    # initialize an empty config dictionary to be filled by yamls
    config = dict()

    # iterate over all yaml files to be loaded
    for conf_file_name in yaml_list:

        # load file
        with open(conf_file_name) as conf_file:
            conf = yaml.load(conf_file, Loader=yaml.FullLoader)

        # check for duplicate keys and load values
        for conf_key, conf_value in conf.items():
            if conf_key in config:
                raise ValueError(
                    f"Conf file {conf_file_name} has the following duplicate key with a previously loaded file: {conf_key}"
                )
            conf_key_converted = conf_key.replace(
                "-", "_"
            )  # TODO remove replace and update configuration files?
            config[conf_key_converted] = conf_value

    # validate that neox args have the same values as specified in the config (if specified in the config)
    for k, v in config.items():
        neox_args_value = getattr(args_loaded, k)
        assert v == neox_args_value, (
            f"loaded neox args value {k} == {neox_args_value} differs from config file value {v}"
        )
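# Hedged usage sketch (not in the original source): the helper above is normally driven
# by small per-config test functions. The config names mirror "small.yml" and
# "local_setup.yml" used elsewhere in this section; the test name itself is illustrative.
def test_neoxargs_load_small_local_setup_sketch():
    run_neox_args_load_test(["small.yml", "local_setup.yml"])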
def get_neox_args(context):
    args = AttrMap(context.get_hparams())
    exp_config = context.get_experiment_config()

    # Gather overrides.
    overwrite_values = args.pop("overwrite_values")

    # We are going to overwrite certain neox_args with values from the Determined
    # experiment config to ensure consistency.
    assert (
        "batches" in exp_config["searcher"]["max_length"]
    ), "Please specify max_length in batches."
    assert (
        "batches" in exp_config["min_validation_period"]
    ), "Please specify min_validation_period in batches."
    overwrite_values.update(
        {
            "train_iters": exp_config["searcher"]["max_length"]["batches"],
            "save_interval": exp_config["min_validation_period"]["batches"],
            "eval_interval": exp_config["min_validation_period"]["batches"],
            "global_num_gpus": context.distributed.get_size(),
            "seed": context.env.trial_seed,
        }
    )
    for k, v in overwrite_values.items():
        logging.info(f"Setting neox_args.{k} to {v}")

    # Build neox args.
    neox_args = NeoXArgs.process_parsed_deepy_args(
        args, overwrite_values=overwrite_values
    )
    return neox_args
def setup_for_inference_or_eval(inference=True, get_key_value=True, overwrite_values=None):
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint; we use setup_model_and_optimizer instead of
    # get_model in order to initialize deepspeed
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args, inference=inference, get_key_value=get_key_value
    )
    print_rank_0("Finished loading model")

    return model, neox_args
def test_neoxargs_consume_neox_args():
    """
    verify megatron args are correctly consumed after sending via deepspeed
    """
    from unittest.mock import patch

    from megatron.neox_arguments import NeoXArgs

    # initially load config from files as would be the case in deepy.py
    yaml_list = get_configs_with_path(["small.yml", "local_setup.yml"])
    args_baseline = NeoXArgs.from_ymls(yaml_list)
    args_baseline.update_value(
        "user_script", str(get_root_directory() / "pretrain_gpt2.py")
    )
    deepspeed_main_args = args_baseline.get_deepspeed_main_args()

    # patch sys.argv so that args can be accessed by set_global_variables within initialize_megatron
    with patch("sys.argv", deepspeed_main_args):
        args_loaded = NeoXArgs.consume_neox_args()

    # TODO is the wandb group really to be changed?
    args_loaded.wandb_group = args_baseline.wandb_group

    assert args_baseline.megatron_config == args_loaded.megatron_config
def model_setup(yaml_list=None, param_dict=None, clear_data=True, inference=False):
    from megatron.neox_arguments import NeoXArgs
    from megatron.mpu import destroy_model_parallel
    from megatron import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    # mpu model parallel contains remaining global vars from previous runs
    destroy_model_parallel()

    if clear_data and (
        not torch.distributed.is_initialized()
        or torch.distributed.get_world_size() == 1
        or torch.distributed.get_rank() == 0
    ):
        clear_test_dirs()

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }

    # yaml_list and param_dict should not both be None
    assert yaml_list is not None or param_dict is not None

    # initially load config from files as would be the case in deepy.py
    if yaml_list is not None:
        args_loaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
    else:
        p_dict = param_dict.copy()
        p_dict.update(overwrite_values)
        args_loaded = NeoXArgs.from_dict(p_dict)

    args_loaded.build_tokenizer()

    initialize_megatron(neox_args=args_loaded)

    model, optimizer, lr_scheduler = setup_model_and_optimizer(
        neox_args=args_loaded, inference=inference, get_key_value=True
    )
    return model, optimizer, lr_scheduler, args_loaded
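# Hedged usage sketch (assumption, not from the original source): a model-level test
# would typically build the model from the same yaml configs used in the tests above
# and then exercise it, e.g. run a forward pass or a checkpoint save/load round trip.
def example_model_test_sketch():
    model, optimizer, lr_scheduler, args_loaded = model_setup(
        yaml_list=get_configs_with_path(["small.yml", "local_setup.yml"]),
        clear_data=True,
    )
    # ... run a forward pass / checkpoint round trip with `model` here ...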
def test_neoxargs_consume_deepy_args_without_yml_suffix():
    """
    verify consume_deepy_args processes command line arguments without yaml suffix
    """
    from megatron.neox_arguments import NeoXArgs

    # load neox args with command line
    with patch(
        "sys.argv",
        [str(get_root_directory() / "deepy.py"), "pretrain_gpt2.py"]
        + get_configs_with_path(["small", "local_setup"]),
    ):
        args_loaded_consume = NeoXArgs.consume_deepy_args()

    # load neox args directly from yaml files
    args_loaded_yamls = NeoXArgs.from_ymls(
        get_configs_with_path(["small.yml", "local_setup.yml"])
    )

    # update values from yaml files that cannot otherwise be matched
    args_loaded_yamls.update_value("user_script", "pretrain_gpt2.py")
    args_loaded_yamls.wandb_group = args_loaded_consume.wandb_group

    assert args_loaded_yamls == args_loaded_consume
def test_neoxargs_fail_instantiate_without_any_params():
    """
    verify an error is raised if required arguments are not provided
    """
    import pytest

    from megatron.neox_arguments import NeoXArgs

    # pytest.raises makes the failure expectation explicit; a bare try/except around an
    # `assert False` would also swallow the assertion and let the test pass vacuously
    with pytest.raises(Exception):
        NeoXArgs()
def setup_for_inference_or_eval(
    use_cache=True,
    overwrite_values=None,
):
    """
    Initializes the model for evaluation or inference (doesn't load optimizer states, etc.)
    from command line args.

    use_cache: bool
        Whether to use key value caching in inference.
    overwrite_values: dict
        Optional. Values to overwrite in the model config.
    """
    from megatron.neox_arguments import NeoXArgs
    from megatron.initialize import initialize_megatron
    from megatron.training import setup_model_and_optimizer

    _overwrite_values = {
        "checkpoint_activations": False,
        "partition_activations": False,
        "no_load_optim": True,
        "zero_optimization": None,  # disable zero optimization (won't be used in inference, and loading zero optimizer can cause errors)
    }
    if overwrite_values:
        _overwrite_values.update(overwrite_values)
    neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()

    if neox_args.load is None:
        raise ValueError("`load` parameter must be supplied to load a model")

    # initialize megatron
    initialize_megatron(neox_args)

    # set up model and load checkpoint; we use setup_model_and_optimizer instead of
    # get_model in order to initialize deepspeed
    model, _, _ = setup_model_and_optimizer(
        neox_args=neox_args,
        use_cache=use_cache,
        iteration=neox_args.iteration,
    )
    print_rank_0("Finished loading model")

    model.module.inference_mode(use_cache=use_cache)
    return model, neox_args
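# Hedged usage sketch (assumption, not from the original source): an evaluation or text
# generation entry point would call the helper above once and then hand `model` and
# `neox_args` on to its generation or evaluation routine.
if __name__ == "__main__":
    model, neox_args = setup_for_inference_or_eval(use_cache=True)
    # ... pass `model` and `neox_args` to the generation / evaluation code ...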
# Copyright (c) 2021, EleutherAI contributors
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pretrain GPT2"""

from megatron.neox_arguments import NeoXArgs
from megatron.training import pretrain

if __name__ == "__main__":
    neox_args = NeoXArgs.consume_neox_args()
    neox_args.configure_distributed_args()
    neox_args.build_tokenizer()  # tokenizer needs to be built in training in order to set the padding vocab
    neox_args.initialize_tensorboard_writer()  # is initialized if tensorboard directory is defined
    pretrain(neox_args=neox_args)
# limitations under the License.

import logging
import os

import deepspeed
from deepspeed.launcher.runner import main

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

from megatron.neox_arguments import NeoXArgs
from megatron.utils import get_wandb_api_key

neox_args = NeoXArgs.consume_deepy_args()

if neox_args.wandb_group is not None:
    # concat the wandb group name with a uid to make sure it's unique
    import wandb

    neox_args.wandb_group += "_" + wandb.util.generate_id()
neox_args.print()

deepspeed_main_args = neox_args.get_deepspeed_main_args()

# Extract wandb API key and inject into worker environments
wandb_token = get_wandb_api_key(neox_args=neox_args)
if wandb_token is not None:
    deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
    os.environ["WANDB_API_KEY"] = wandb_token

if __name__ == "__main__":
    main(deepspeed_main_args)
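# Hedged usage note: judging from the sys.argv patching in the tests above, this launcher
# is invoked roughly as
#
#     python ./deepy.py train.py -d configs small.yml local_setup.yml
#
# i.e. a user script followed by one or more config files, optionally with a config
# directory passed via -d. The directory name "configs" here is an illustrative assumption.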