def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local rank passed from distributed launcher")
    parser.add_argument("-s", "--steps", type=int, default=100,
                        help="quit after this many steps")
    parser.add_argument("-p", "--pipeline-parallel-size", type=int, default=2,
                        help="pipeline parallelism")
    parser.add_argument("--backend", type=str, default="nccl",
                        help="distributed backend")
    parser.add_argument("--seed", type=int, default=0, help="PRNG seed")
    # Use store_true flags: argparse's type=bool treats any non-empty string
    # (including "False") as True, so a bool-typed option cannot be switched off.
    parser.add_argument("--fp16", default=False, action="store_true",
                        help="fp16 run")
    parser.add_argument("--run_without_ort", default=False, action="store_true",
                        help="run with DeepSpeed only, without ONNX Runtime (ORT)")
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def _get_parser(extra_args_provider=None):
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
                                     allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_zero_args(parser)
    parser = _add_activation_checkpoint_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    return parser

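The extra_args_provider hook above takes the parser and returns it, so callers can register task-specific options before DeepSpeed's. A minimal sketch of such a provider; the --my-task-option flag and the 'my task' group are hypothetical, not part of Megatron-LM:

def my_extra_args(parser):
    # Caller-supplied options land in their own argument group.
    group = parser.add_argument_group(title='my task')
    group.add_argument('--my-task-option', type=str, default=None,
                       help='example of a caller-supplied option')
    return parser

parser = _get_parser(extra_args_provider=my_extra_args)
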
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=1, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--model', type=str, default='resnet50',
                        help='model to benchmark')
    parser.add_argument('--datadir', type=str, required=False,
                        help='Data directory')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('--steps', type=int, required=False,
                        help='Maximum number of training steps')
    parser.add_argument('--warmup-steps', type=int, default=10,
                        help='Number of initial steps to ignore in average')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    train(args)

def get_parser():
    # Command-line args.
    parser = argparse.ArgumentParser(
        description='VAE Set Generation Experiment')
    parser = deepspeed.add_config_arguments(parser)
    parser = add_args(parser)
    return parser

def parse_arguments():
    parser = argparse.ArgumentParser(description='CIFAR')

    # cuda
    parser.add_argument('--with_cuda', default=False, action='store_true',
                        help='use CPU in case there\'s no GPU support')
    parser.add_argument('--use_ema', default=False, action='store_true',
                        help='whether use exponential moving average')

    # train
    parser.add_argument('-e', '--epochs', default=1, type=int,
                        help='number of total epochs (default: 1)')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    return parser.parse_args()

def add_argument():
    parser = argparse.ArgumentParser(description='enwik8')
    parser.add_argument('--with_cuda', default=False, action='store_true',
                        help='use CPU in case there\'s no GPU support')
    parser.add_argument('--use_ema', default=False, action='store_true',
                        help='whether use exponential moving average')
    parser.add_argument('-b', '--batch_size', default=32, type=int,
                        help='mini-batch size (default: 32)')
    parser.add_argument('-e', '--epochs', default=30, type=int,
                        help='number of total epochs (default: 30)')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def add_argument():
    parser = argparse.ArgumentParser(description="CIFAR")

    # data
    # cuda
    parser.add_argument("--with_cuda", default=False, action="store_true",
                        help="use CPU in case there's no GPU support")
    parser.add_argument("--use_ema", default=False, action="store_true",
                        help="whether use exponential moving average")

    # train
    parser.add_argument("-b", "--batch_size", default=32, type=int,
                        help="mini-batch size (default: 32)")
    parser.add_argument("-e", "--epochs", default=30, type=int,
                        help="number of total epochs (default: 30)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local rank passed from distributed launcher")

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

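These CIFAR-style parsers are normally paired with deepspeed.initialize, which consumes the --deepspeed/--deepspeed_config flags registered by add_config_arguments. A usage sketch along the lines of the DeepSpeed CIFAR-10 tutorial; net, trainset, and criterion are placeholders that the snippets above do not define:

args = add_argument()
parameters = [p for p in net.parameters() if p.requires_grad]

# Returns the engine, the (possibly DeepSpeed-managed) optimizer, and a
# distributed-aware dataloader built from the dataset.
model_engine, optimizer, trainloader, _ = deepspeed.initialize(
    args=args, model=net, model_parameters=parameters, training_data=trainset)

for inputs, labels in trainloader:
    inputs = inputs.to(model_engine.local_rank)
    labels = labels.to(model_engine.local_rank)
    loss = criterion(model_engine(inputs), labels)
    model_engine.backward(loss)  # the engine applies loss scaling when fp16 is enabled
    model_engine.step()
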
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('-s', '--steps', type=int, default=100,
                        help='quit after this many steps')
    parser.add_argument('-p', '--pipeline-parallel-size', type=int, default=2,
                        help='pipeline parallelism')
    parser.add_argument('--backend', type=str, default='nccl',
                        help='distributed backend')
    parser.add_argument('--seed', type=int, default=0, help='PRNG seed')
    # store_true flags instead of type=bool, which treats any non-empty string as True.
    parser.add_argument('--fp16', default=False, action='store_true',
                        help='fp16 run')
    parser.add_argument('--run_without_ort', default=False, action='store_true',
                        help='run with DeepSpeed only, without ONNX Runtime (ORT)')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def get_arguments():
    parser = get_argument_parser()

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    return args

def get_args():
    parser = argparse.ArgumentParser(description='GPTNeox Deepspeed Training Script')
    parser.add_argument('--model', type=str, default="gpt3_small")
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def get_args():
    """Parse all the args."""
    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_data_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    if not args.data_dir:
        print('WARNING: No data specified')

    args.cuda = torch.cuda.is_available()
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))

    if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The fp32_* and fp16_* args are only meant to be active when fp16 is set,
    # so their defaults should all be False.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    return args

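To make the rank arithmetic above concrete, consider an mpirun launch over two Slurm nodes with four processes per node (the numbers are illustrative, not from the source):

# SLURM_JOB_NUM_NODES=2, SLURM_NODEID=1
# OMPI_COMM_WORLD_LOCAL_SIZE=4, OMPI_COMM_WORLD_LOCAL_RANK=1
local_rank, local_size = 1, 4
nodeid, num_nodes = 1, 2

rank = nodeid * local_size + local_rank   # 1 * 4 + 1 = 5
world_size = num_nodes * local_size       # 2 * 4 = 8
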
def parse_args():
    parser = argparse.ArgumentParser(description="Train a detector")
    parser.add_argument("config", help="train config file path")
    parser.add_argument("--work-dir", help="the dir to save logs and models")
    parser.add_argument("--resume-from", help="the checkpoint file to resume from")
    parser.add_argument(
        "--no-validate",
        action="store_true",
        help="whether not to evaluate the checkpoint during training",
    )
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        "--gpus",
        type=int,
        help="number of gpus to use (only applicable to non-distributed training)",
    )
    group_gpus.add_argument(
        "--gpu-ids",
        type=int,
        nargs="+",
        help="ids of gpus to use (only applicable to non-distributed training)",
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument(
        "--deterministic",
        action="store_true",
        help="whether to set deterministic options for CUDNN backend.",
    )
    parser.add_argument("--options", nargs="+", action=DictAction,
                        help="arguments in dict")
    parser.add_argument(
        "--launcher",
        choices=["none", "pytorch", "slurm", "mpi"],
        default="none",
        help="job launcher",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--autoscale-lr",
        action="store_true",
        help="automatically scale lr with the number of gpus",
    )

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    if "LOCAL_RANK" not in os.environ:
        os.environ["LOCAL_RANK"] = str(args.local_rank)

    return args

def get_arguments():
    parser = get_argument_parser()

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()

    # no cuda mode is not supported
    args.no_cuda = False

    return args

def add_argument(args, extended_parser):
    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(extended_parser)
    new_args, _ = parser.parse_known_args()
    new_args.deepspeed_config = args.config_file
    new_args.deepspeed = args.deepspeed_flag
    print(f"new args={new_args}")
    return new_args

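This variant copies the DeepSpeed settings from an already-parsed configuration object rather than from the command line. A usage sketch; the base parser, the Existing holder class, and its attribute values are hypothetical stand-ins for whatever the caller actually has:

base_parser = argparse.ArgumentParser()
base_parser.add_argument('--lr', type=float, default=1e-4)

class Existing:
    # Hypothetical object exposing the two attributes the function reads.
    config_file = 'ds_config.json'
    deepspeed_flag = True

ds_args = add_argument(Existing(), base_parser)
# ds_args.deepspeed_config == 'ds_config.json' and ds_args.deepspeed is True,
# so ds_args can now be handed to deepspeed.initialize(args=ds_args, ...).
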
def test_no_ds_arguments():
    parser = basic_parser()
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args(['--num_epochs', '2'])
    assert args

    assert hasattr(args, 'num_epochs')
    assert args.num_epochs == 2

    assert hasattr(args, 'deepspeed')
    assert args.deepspeed == False

    assert hasattr(args, 'deepspeed_config')
    assert args.deepspeed_config == None

def build_deepspeed_args(deepspeed_config_path: str, local_rank: int = 0):
    from argparse import ArgumentParser, Namespace

    parser = ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=local_rank)
    parser = deepspeed.add_config_arguments(parser)

    args, _ = parser.parse_known_args()
    arg_dict = vars(args)
    arg_dict.update(
        dict(deepspeed_config=deepspeed_config_path,
             deepspeed=True,
             local_rank=local_rank))
    return Namespace(**arg_dict)

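build_deepspeed_args is handy when DeepSpeed is embedded in a program that does not own the command line. A usage sketch; model is a placeholder for any torch.nn.Module and ds_config.json for an existing config file:

args = build_deepspeed_args('ds_config.json', local_rank=0)
engine, optimizer, _, _ = deepspeed.initialize(
    args=args,
    model=model,
    model_parameters=model.parameters())
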
def test_no_ds_enable_argument():
    parser = basic_parser()
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args(
        ['--num_epochs', '2', '--deepspeed_config', 'foo.json'])
    assert args

    assert hasattr(args, 'num_epochs')
    assert args.num_epochs == 2

    assert hasattr(args, 'deepspeed')
    assert args.deepspeed == False

    assert hasattr(args, 'deepspeed_config')
    assert type(args.deepspeed_config) == str
    assert args.deepspeed_config == 'foo.json'

def test_core_deepscale_arguments():
    parser = basic_parser()
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args(
        ['--num_epochs', '2', '--deepspeed', '--deepspeed_config', 'foo.json'])
    assert args

    assert hasattr(args, 'num_epochs')
    assert args.num_epochs == 2

    assert hasattr(args, 'deepspeed')
    assert type(args.deepspeed) == bool
    assert args.deepspeed == True

    assert hasattr(args, 'deepspeed_config')
    assert type(args.deepspeed_config) == str
    assert args.deepspeed_config == 'foo.json'

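The three tests above rely on a basic_parser() fixture that is not shown here. A minimal sketch consistent with their assertions (only --num_epochs is needed):

import argparse

def basic_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_epochs', type=int)
    return parser
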
def add_argument():
    parser = argparse.ArgumentParser(description='NSMC KoELECTRA')

    # train
    parser.add_argument('-e', '--epochs', default=5, type=int,
                        help='number of total epochs (default: 5)')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')

    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def wrap_arg_parser(parser):
    """Add arguments to support optional DeepSpeed usage."""
    if not has_deepspeed():
        parser.add_argument(
            '--deepspeed',
            type=lambda _: False,
            help="whether to use DeepSpeed (ignored since it's not available)",
        )
    else:
        parser = deepspeed.add_config_arguments(parser)
    parser.add_argument(
        '--local_rank',
        type=int,
        required=False,
        default=-1,
        help='local rank passed from distributed launcher',
    )
    return parser

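wrap_arg_parser depends on a has_deepspeed() helper that is not shown. A minimal sketch following the usual optional-import pattern:

def has_deepspeed():
    """Return True if the deepspeed package can be imported."""
    try:
        import deepspeed  # noqa: F401
        return True
    except ImportError:
        return False
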
def add_argument():
    """
    https://www.deepspeed.ai/tutorials/cifar-10/
    """
    parser = argparse.ArgumentParser(description='CIFAR')

    # data
    # cuda
    parser.add_argument('--with_cuda', default=False, action='store_true',
                        help='use CPU in case there\'s no GPU support')
    parser.add_argument('--use_ema', default=False, action='store_true',
                        help='whether use exponential moving average')

    # train
    parser.add_argument('-b', '--batch_size', default=512, type=int,
                        help='mini-batch size (default: 512)')
    parser.add_argument('-e', '--epochs', default=30, type=int,
                        help='number of total epochs (default: 30)')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    return parser.parse_args()

def get_args():
    parser = argparse.ArgumentParser(description='CIFAR')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('-s', '--steps', type=int, default=100,
                        help='quit after this many steps')
    parser.add_argument('-p', '--pipeline-parallel-size', type=int, default=2,
                        help='pipeline parallelism')
    parser.add_argument('--backend', type=str, default='nccl',
                        help='distributed backend')
    parser.add_argument('--seed', type=int, default=1138, help='PRNG seed')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args

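The pipeline-parallel variants of get_args in this section feed --backend and --pipeline-parallel-size into DeepSpeed's pipeline engine. A wiring sketch modeled on the DeepSpeed pipeline-parallelism tutorial; join_layers, base_model, and trainset are placeholders not defined above:

import torch
import deepspeed
from deepspeed.pipe import PipelineModule

args = get_args()
deepspeed.init_distributed(dist_backend=args.backend)

# Partition a flat list of layers across the requested number of stages.
net = PipelineModule(layers=join_layers(base_model),
                     loss_fn=torch.nn.CrossEntropyLoss(),
                     num_stages=args.pipeline_parallel_size)

engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=trainset)

for step in range(args.steps):
    loss = engine.train_batch()  # pulls micro-batches from training_data internally
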
def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): """Parse all arguments.""" parser = argparse.ArgumentParser(description='Megatron-LM Arguments', allow_abbrev=False) # Standard arguments. parser = _add_network_size_args(parser) parser = _add_regularization_args(parser) parser = _add_training_args(parser) parser = _add_initialization_args(parser) parser = _add_learning_rate_args(parser) parser = _add_checkpointing_args(parser) parser = _add_mixed_precision_args(parser) parser = _add_distributed_args(parser) parser = _add_validation_args(parser) parser = _add_data_args(parser) parser = _add_autoresume_args(parser) parser = _add_realm_args(parser) parser = _add_zero_args(parser) parser = _add_activation_checkpoint_args(parser) # Custom arguments. if extra_args_provider is not None: parser = extra_args_provider(parser) # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) # Parse. if ignore_unknown_args: args, _ = parser.parse_known_args() else: args = parser.parse_args() # Distributed args. args.rank = int(os.getenv('RANK', '0')) args.world_size = int(os.getenv("WORLD_SIZE", '1')) args.model_parallel_size = min(args.model_parallel_size, args.world_size) if args.rank == 0: print('using world size: {} and model-parallel size: {} '.format( args.world_size, args.model_parallel_size)) # Fp16 loss scaling. args.dynamic_loss_scale = False if args.loss_scale is None: args.dynamic_loss_scale = True # Parameters dtype. args.params_dtype = torch.float if args.fp16: args.params_dtype = torch.half if args.rank == 0: print('using {} for parameters ...'.format(args.params_dtype), flush=True) # Set input defaults. for key in defaults: # For default to be valid, it should not be provided in the # arguments that are passed to the program. We check this by # ensuring the arg is set to None. if getattr(args, key) is not None: if args.rank == 0: print('WARNING: overriding default arguments for {key}:{v} \ with {key}:{v2}'.format(key=key, v=defaults[key], v2=getattr(args, key)), flush=True) else: setattr(args, key, defaults[key]) # Check required arguments. required_args = [ 'num_layers', 'hidden_size', 'num_attention_heads', 'max_position_embeddings' ] for req_arg in required_args: _check_arg_is_not_none(args, req_arg) # Checks. assert args.hidden_size % args.num_attention_heads == 0 if args.seq_length is not None: assert args.max_position_embeddings >= args.seq_length if args.lr is not None: assert args.min_lr <= args.lr if args.save is not None: assert args.save_interval is not None # Parameters sharing does not work with torch DDP. if (args.num_unique_layers is not None) and (args.num_layers is not None): assert args.num_unique_layers <= args.num_layers assert args.num_layers % args.num_unique_layers == 0, \ 'num-layers should be divisible by num-unique-layers.' if args.num_unique_layers < args.num_layers: assert args.DDP_impl == 'local', \ 'torch-DDP does not work with parameters sharing.' # Mixed precision checks. if args.fp16_lm_cross_entropy: assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.' # Activation checkpointing. 
if args.distribute_checkpointed_activations: assert args.checkpoint_activations, \ 'for distribute-checkpointed-activations to work you '\ 'need to enable checkpoint-activations' # load scaled_upper_triang_masked_softmax_fusion kernel if args.scaled_upper_triang_masked_softmax_fusion: fused_kernels.load_scaled_upper_triang_masked_softmax_fusion_kernel() # load scaled_masked_softmax_fusion kernel if args.scaled_masked_softmax_fusion: fused_kernels.load_scaled_masked_softmax_fusion_kernel() _print_args(args) return args
def add_argument(): parser = argparse.ArgumentParser( description='Train Transformer Model for Genome to SMILE translation.') parser.add_argument('--with_cuda', default=False, action='store_true', help='use CPU in case there\'s no GPU support') parser.add_argument('--use_ema', default=False, action='store_true', help='whether use exponential moving average') parser.add_argument('-e', '--epochs', default=10, type=int, help='number of total epochs (default: 30)') parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') parser.add_argument('--ff_chunks', type=int, default=100, help='Reduce memory by chunking') # 3200 parser.add_argument('--attn_chunks', type=int, default=1, help='reduce memory by chunking attention') # 128 parser.add_argument('--dim', type=int, default=1024, help='hidden layers dimension') # 128 parser.add_argument('--emb_dim', type=int, default=128, help='input embedding dimension') # 64 parser.add_argument('--bucket_size', type=int, default=64, help='Bucket size for hashing') # 8 parser.add_argument('--depth', type=int, default=12, help='number of hidden layers') # 12 parser.add_argument('--validate_every', type=int, default=10, help='Frequency of validation') # 12 parser.add_argument('--save_every', type=int, default=10, help='Frequency of saving checkpoint') # 12 parser.add_argument( '--output_folder', type=str, default='./training_output', help='Output folder where to store the training output') # 12 parser.add_argument('--path_to_file_tr', default='./gen_to_mol_tr.csv', help='Trainig file') parser.add_argument('--path_to_file_ts', default='./gen_to_mol_ts.csv', help='Testing file') parser.add_argument('--ds_conf', default='./ds_config.json', help='DeepSpeed configuration file') parser.add_argument('--max_len_gen', type=int, default=32768, help='Max nucleotides per genome.') parser.add_argument('--min_len_gen', type=int, default=-1, help='Min nucleotides per genome') parser.add_argument('--max_len_mol', type=int, default=2048, help='Max symbols for Canonical SMILES.') parser.add_argument('--num_examples_tr', type=int, default=1024, help='Max number of samples TR') parser.add_argument('--num_examples_ts', type=int, default=1024, help='Max number of samples TS') #parser.add_argument('--train_batch_size', type=int,default=8, help='Batch size') parser.add_argument('--heads', type=int, default=8, help='Heads') parser.add_argument( '--n_hashes', type=int, default=4, help= 'Number of hashes - 4 is permissible per author, 8 is the best but slower' ) parser.add_argument( '--use_encdec_v2', default=False, action='store_true', help= 'Use the V2 of the EncDec architecture wrapped by Philip Wang (lucidrain on github)' ) parser.add_argument( '--use_full_attn', default=False, action='store_true', help= 'Only turn on this flag to override and turn on full attention for all sequence lengths.' ) parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() return args
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) # Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") #parser.add_argument( # '--deepscale', # default=False, # action='store_true', # help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--model_file", type=str, default="0", help="Path to the Pretrained BERT Encoder File.") parser.add_argument('--random', default=False, action='store_true', help="Whether to fientune for random initialization") parser.add_argument('--focal', default=False, action='store_true', help="Whether to use Focal Loss for finetuning.") parser.add_argument('--gamma', type=float, default=0.5, help="Gamma parameter to be used in focal loss.") parser.add_argument('--deepspeed_sparse_attention', default=False, action='store_true', help='Use DeepSpeed sparse self attention.') parser.add_argument( '--preln', action='store_true', default=False, help= "Switching to the variant of Transformer blocks that use pre-LayerNorm." ) parser.add_argument('--deepspeed_transformer_kernel', default=False, action='store_true', help='Use DeepSpeed transformer kernel to accelerate.') parser.add_argument( '--progressive_layer_drop', default=False, action='store_true', help="Whether to enable progressive layer dropping or not") parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps args.seed = random.randint(1, 1000) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if (torch.distributed.get_rank() == 0): # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError( # "Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) torch.distributed.barrier() task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) bert_base_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 1024, "num_hidden_layers": 24, "num_attention_heads": 16, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.progressive_layer_drop: print("BertBaseConfigPreLnLayerDrop") from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig, BertLayer elif args.preln: from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer else: from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer bert_config = BertConfig(**bert_base_model_config) bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) model = BertForSequenceClassification(args, bert_config, num_labels=num_labels) if args.model_file is not "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") # bert_state_dict = torch.load(args.model_file) # model.bert.load_state_dict(bert_state_dict) checkpoint_state_dict = torch.load(args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") if 
args.random: logger.info("USING RANDOM INITIALISATION FOR FINETUNING") model.apply(model.init_bert_weights) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: if args.deepscale: print("Enabling DeepScale") from deepscale.distributed_apex import DistributedDataParallel as DDP else: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Patch model with deepspeed transformer kernel if not args.deepspeed_transformer_kernel: from deepspeed import replace_transformer_layer model = deepspeed.module_inject.replace_transformer_layer( orig_layer_impl=BertLayer, model=model, micro_batch_size=args.train_batch_size, bert_config=bert_config, seed=args.seed, preln=arg.preln, fp16=args.fp16, huggingface=False) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=optimizer_grouped_parameters, dist_init_required=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": if args.fp16: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.half) else: all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() nb_tr_examples = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: 
loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.deepscale and args.local_rank != -1: model.disable_need_reduction() if (step + 1) % args.gradient_accumulation_steps == 0: model.enable_need_reduction() if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * \ warmup_linear( global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 saved_path = os.path.join(args.output_dir, "finetuned_quantized_checkpoints") checkpoint_model(PATH=saved_path, ckpt_id='epoch{}_step{}'.format( args.num_train_epochs, global_step), model=model, epoch=args.num_train_epochs, last_global_step=global_step, last_global_data_samples=nb_tr_examples * torch.distributed.get_world_size()) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() print(logits.type()) print(label_ids.type()) if task_name == "sts-b": tmp_eval_loss = loss_fct(logits.float().view(-1), label_ids.view(-1)) else: tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = 
preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}{}) already exists and is not empty.". format(args.output_dir, '-MM')) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) if args.focal: loss_fct = FocalLoss(class_num=num_labels, gamma=args.gamma) else: loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
parser.add_argument('--ckpt_dir',
                    default='./ckpt_dir',
                    type=str,
                    dest='ckpt_dir',
                    help='directory to save and load checkpoints to')
parser.add_argument('--log_dir',
                    default='./ckpt_dir',
                    type=str,
                    dest='log_dir',
                    help='directory to write logs to')
parser.add_argument('--ckpt_id',
                    type=int,
                    dest='ckpt_id',
                    help='The ckpt you wish to continue from.')

# Include DeepSpeed configuration arguments
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()

if args.with_cuda:
    assert torch.cuda.is_available()

trainer = ReformerTrainer(dataset,
                          model,
                          tokenizer,
                          train_batch_size=args.batch_size,
                          eval_batch_size=args.batch_size,
                          tb_writer=False)

train_dataset, eval_dataset = trainer.split_datasets(train_test_split=0.1)
train_dataloader, eval_dataloader = trainer.build_dataloaders(
    train_dataset, eval_dataset)

def get_args(): """Parse all the args.""" parser = argparse.ArgumentParser(description='PyTorch BERT Model') parser = add_model_config_args(parser) parser = add_fp16_config_args(parser) parser = add_training_args(parser) parser = add_evaluation_args(parser) parser = add_text_generate_args(parser) parser = add_data_args(parser) # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() if not args.train_data and not args.train_data_path: print('WARNING: No training data specified') args.cuda = torch.cuda.is_available() args.rank = int(os.getenv('RANK', '0')) args.world_size = int(os.getenv("WORLD_SIZE", '1')) if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'): # We are using (OpenMPI) mpirun for launching distributed data parallel processes local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK')) local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) # Possibly running with Slurm num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1')) nodeid = int(os.getenv('SLURM_NODEID', '0')) args.local_rank = local_rank args.rank = nodeid * local_size + local_rank args.world_size = num_nodes * local_size args.model_parallel_size = min(args.model_parallel_size, args.world_size) if args.rank == 0: print('using world size: {} and model-parallel size: {} '.format( args.world_size, args.model_parallel_size)) args.dynamic_loss_scale = False if args.loss_scale is None: args.dynamic_loss_scale = True if args.rank == 0: print(' > using dynamic loss scaling') # The args fp32_* or fp16_* meant to be active when the # args fp16 is set. So the default behaviour should all # be false. if not args.fp16: args.fp32_embedding = False args.fp32_tokentypes = False args.fp32_layernorm = False if hasattr(args, "deepspeed" ) and args.deepspeed and args.deepspeed_config is not None: with open(args.deepspeed_config) as file: deepspeed_config = json.load(file) if "train_micro_batch_size_per_gpu" in deepspeed_config: args.batch_size = deepspeed_config[ "train_micro_batch_size_per_gpu"] if "gradient_accumulation_steps" in deepspeed_config: args.gradient_accumulation_steps = deepspeed_config[ "gradient_accumulation_steps"] else: args.gradient_accumulation_steps = None if "optimizer" in deepspeed_config: optimizer_params_config = deepspeed_config["optimizer"].get( "params", {}) args.lr = optimizer_params_config.get("lr", args.lr) args.weight_decay = optimizer_params_config.get( "weight_decay", args.weight_decay) return args
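The get_args variant above reads train_micro_batch_size_per_gpu, gradient_accumulation_steps, and the optimizer params back out of the DeepSpeed JSON config. A minimal config consistent with those lookups, written from Python; every value is illustrative only:

import json

ds_config = {
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "optimizer": {
        "type": "Adam",
        "params": {"lr": 1e-4, "weight_decay": 0.01},
    },
    "fp16": {"enabled": True},
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)

# With --deepspeed --deepspeed_config ds_config.json, get_args() would then set
# args.batch_size = 8, args.gradient_accumulation_steps = 4, args.lr = 1e-4,
# and args.weight_decay = 0.01.
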
def main(): parser = argparse.ArgumentParser( description='Simple Resnet training examaple') parser.add_argument('images', type=Path) parser.add_argument('--pipeline-batch', default=False, action='store_true') parser.add_argument('--device', default='cpu') parser.add_argument('--pin-memory', default=False, action='store_true') parser.add_argument('--num-workers', type=int) parser.add_argument('--prefetch-factor', type=int) parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() rng = np.random.default_rng(1729) if args.device == 'cpu': device = torch.device('cpu') else: if not torch.cuda.is_available(): raise RuntimeError( f"CUDA not available and device set to {args.device}") else: device = torch.device(args.device) torch.backends.cudnn.benchmark = True print(f"Device is set to {device}") train_set, dev_set, test_set = make_datasets(args.images, rng=rng) batch_size = 8 max_epochs = 1 dataloader_kwargs = dict() if args.pin_memory: dataloader_kwargs['pin_memory'] = True if args.num_workers: dataloader_kwargs['num_workers'] = args.num_workers if args.prefetch_factor: dataloader_kwargs['prefetch_factor'] = args.num_workers model = resnet152(pretrained=False, num_classes=train_set.num_classes) #model = resnet18(pretrained=False, num_classes=train_set.num_classes) #model = LogisticRegression(train_set[0][0].shape, train_set.num_classes) #model = alexnet(pretrained=False, num_classes=train_set.num_classes) model_engine, optimizer, _, __ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) training_loader = model_engine.deepspeed_io(train_set, **dataloader_kwargs) dev_loader = model_engine.deepspeed_io(dev_set, **dataloader_kwargs) test_loader = model_engine.deepspeed_io(test_set, **dataloader_kwargs) loss_fn = nn.CrossEntropyLoss() #optimizer = AdamW(model.parameters(), lr=1e-4) for epoch in range(max_epochs): training_losses = [] model_engine.train() for x, y in tqdm(training_loader, desc='training progress', total=len(training_loader)): #optimizer.zero_grad() prediction = model_engine(x.to(model_engine.local_rank)) loss = loss_fn(prediction, y.to(model_engine.local_rank)) #loss.backward() #optimizer.step() model_engine.backward(loss) model_engine.step() training_losses.append(loss.item()) print(f'Training loss: {np.mean(training_losses)}') val_losses = [] model_engine.eval() with torch.no_grad(): for x, y in dev_loader: prediction = model(x.to(model_engine.local_rank)) loss = loss_fn(prediction, y.to(model_engine.local_rank)) val_losses.append(loss.item()) print(f'Validation loss: {np.mean(val_losses)}') test_match = [] model_engine.eval() with torch.no_grad(): for x, y in test_loader: prediction = model(x.to(model_engine.local_rank)) correct = torch.argmax(prediction, dim=-1) == y.to(model_engine.local_rank) test_match.extend(correct.cpu().tolist()) print(f'Test accuracy: {np.mean(test_match)}')
def main(): parser = get_argument_parser() # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) # Prepare Summary writer if torch.distributed.get_rank() == 0 and args.job_name is not None: args.summary_writer = get_summary_writer(name=args.job_name, base=args.output_dir) else: args.summary_writer = None tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForQuestionAnswering.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) # Support for word embedding padding checkpoints # Prepare model bert_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 1024, "num_hidden_layers": 24, "num_attention_heads": 16, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": args.dropout, "attention_probs_dropout_prob": args.dropout, "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.preln: bert_config = BertConfigPreLN(**bert_model_config) else: bert_config = BertConfig(**bert_model_config) bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) if args.preln: model = BertForQuestionAnsweringPreLN(bert_config, args) else: model = BertForQuestionAnswering(bert_config, args) print("VOCAB SIZE:", bert_config.vocab_size) if args.model_file is not "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") checkpoint_state_dict = torch.load(args.model_file, 
map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") #bert_state_dict = torch.load(args.model_file) #model.bert.load_state_dict(bert_state_dict, strict=False) logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=optimizer_grouped_parameters, dist_init_required=False) logger.info("propagate deepspeed-config settings to client settings") args.train_batch_size = model.train_micro_batch_size_per_gpu() args.gradient_accumulation_steps = model.gradient_accumulation_steps() args.fp16 = model.fp16_enabled() args.print_steps = model.steps_per_print() args.learning_rate = model.get_lr()[0] args.wall_clock_breakdown = model.wall_clock_breakdown() t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, 
sampler=train_sampler, batch_size=args.train_batch_size) model.train() ema_loss = 0. sample_count = 0 num_epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): num_epoch += 1 epoch_step = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", smoothing=0)): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps ema_loss = args.loss_plot_alpha * ema_loss + ( 1 - args.loss_plot_alpha) * loss.item() model.backward(loss) sample_count += (args.train_batch_size * torch.distributed.get_world_size()) if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step model.step() global_step += 1 epoch_step += 1 if torch.distributed.get_rank( ) == 0 and args.summary_writer: summary_events = [ (f'Train/Steps/lr', lr_this_step, global_step), (f'Train/Samples/train_loss', loss.item(), sample_count), (f'Train/Samples/lr', lr_this_step, sample_count), (f'Train/Samples/train_ema_loss', ema_loss, sample_count) ] if args.fp16 and hasattr(optimizer, 'cur_scale'): summary_events.append( (f'Train/Samples/scale', optimizer.cur_scale, sample_count)) write_summary_events(args.summary_writer, summary_events) args.summary_writer.flush() if torch.distributed.get_rank() == 0 and ( step + 1) % args.print_steps == 0: logger.info( f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}" ) else: model.step() if is_time_to_exit(args=args, epoch_steps=epoch_step, global_steps=global_step): logger.info( f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}' ) break # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned #model_state_dict = torch.load(output_model_file) #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) # model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = 
torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def get_args():
    parser = argparse.ArgumentParser()
    parser = deepspeed.add_config_arguments(parser)

    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to input JSON')
    group.add_argument('--json-keys', nargs='+', default=['text'],
                       help='space separated list of keys to extract from json')
    group.add_argument('--split-sentences', action='store_true',
                       help='Split documents into sentences.')
    group.add_argument('--keep-newlines', action='store_true',
                       help='Keep newlines between sentences when splitting.')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase',
                                'BertWordPieceCase',
                                'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')
    group.add_argument('--append-eod', action='store_true',
                       help='Append an <eod> token to the end of a document.')

    group = parser.add_argument_group(title='output data')
    #group.add_argument('--output-prefix', type=str, required=True,
    #                   help='Path to binary output file without suffix')
    group.add_argument('--dataset-impl', type=str, default='mmap',
                       choices=['lazy', 'cached', 'mmap'])

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--log-interval', type=int, default=100,
                       help='Interval between progress updates')

    args = parser.parse_args()
    args.keep_empty = False

    if args.tokenizer_type.lower().startswith('bert'):
        if not args.split_sentences:
            print("Bert tokenizer detected, are you sure you don't want to split sentences?")

    # some default/dummy values for the tokenizer
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.model_parallel_size = 1

    return args