def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    model.network, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model.network,
        model_parameters=optimizer_grouped_parameters)

    # Overwrite application configs with DeepSpeed config
    args.train_micro_batch_size_per_gpu = model.network.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.network.gradient_accumulation_steps()

    # Set DeepSpeed info
    args.local_rank = model.network.local_rank
    args.device = model.network.device
    model.set_device(args.device)
    args.fp16 = model.network.fp16_enabled()
    args.use_lamb = (model.network.optimizer_name() ==
                     deepspeed.pt.deepspeed_config.LAMB_OPTIMIZER)

    # Prepare Summary Writer and saved_models path
    if dist.get_rank() == 0:
        summary_writer = get_sample_writer(name=args.job_name, base=args.output_dir)
        args.summary_writer = summary_writer
        os.makedirs(args.saved_model_path, exist_ok=True)

    return model, optimizer
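
# For context: a minimal, hypothetical sketch of how the (model, optimizer)
# pair returned above is typically driven under DeepSpeed. The batch handling
# and loss computation are placeholders, not this repo's real training loop;
# backward() and step() are the DeepSpeed engine's own methods, which handle
# FP16 loss scaling and gradient accumulation internally.
def train_step_sketch(model, batch):
    loss = model.network(batch)    # forward through the DeepSpeed engine
    model.network.backward(loss)   # engine-managed backward (loss scaling)
    model.network.step()           # optimizer step, honoring accumulation steps
    return loss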
def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # DeepSpeed initializer handles FP16, distributed, optimizer automatically.
    model.network, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model.network,
        model_parameters=optimizer_grouped_parameters,
        dist_init_required=False)

    # Overwrite application configs with DeepSpeed config
    args.train_batch_size = model.network.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.network.gradient_accumulation_steps()

    return model, optimizer
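
# A hypothetical DeepSpeed config matching the fields read back above
# (train_micro_batch_size_per_gpu, gradient_accumulation_steps). In this
# codebase the config is a JSON file passed on the command line via
# --deepspeed_config; the values and file name below are placeholders,
# not the settings actually used for training.
import json

ds_config_sketch = {
    "train_micro_batch_size_per_gpu": 32,
    "gradient_accumulation_steps": 4,
    "fp16": {"enabled": True},
    "optimizer": {"type": "Lamb", "params": {"lr": 2e-3}},
}

with open("deepspeed_bsz32_lamb_config.json", "w") as f:
    json.dump(ds_config_sketch, f, indent=2)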
def prepare_model_optimizer(args):
    # Loading Model
    model = BertMultiTask(args)
    if args.fp16:
        model.half()
    model.to(args.device)

    # Optimizer parameters
    optimizer_grouped_parameters = prepare_optimizer_parameters(args, model)

    # Prepare Optimizer
    config = args.config
    logger = args.logger
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer, FP16_UnfusedOptimizer, FusedAdam, FusedLamb
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        if args.use_lamb:
            logger.info(
                "Using Lamb optimizer min_coeff={}, max_coeff={}".format(
                    args.min_lamb, args.max_lamb))
            optimizer = FusedLamb(optimizer_grouped_parameters,
                                  lr=config["training"]["learning_rate"],
                                  bias_correction=False,
                                  max_grad_norm=1.0,
                                  max_coeff=args.max_lamb,
                                  min_coeff=args.min_lamb)
        else:
            logger.info("Using Adam optimizer")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=config["training"]["learning_rate"],
                                  bias_correction=False,
                                  max_grad_norm=1.0)
        logger.info(f"unwrapped optimizer_state = {optimizer.state_dict()}")
        if args.use_lamb:
            optimizer = FP16_UnfusedOptimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=config["training"]["learning_rate"],
            warmup=config["training"]["warmup_proportion"],
            t_total=config["training"]["total_training_steps"])

    if args.local_rank != -1 or args.n_gpu > 1:
        # apex DDP is needed in both branches below; import it once here so
        # the single-node multi-GPU path does not hit a NameError.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

    if args.local_rank != -1:
        logger.info("***** Using Default Apex Distributed Data Parallel *****")
        torch.cuda.set_device(args.local_rank)
        model.network = DDP(model.network,
                            delay_allreduce=args.delay_allreduce,
                            message_size=250000000)
    elif args.n_gpu > 1:
        model.network = DDP(model.network,
                            delay_allreduce=args.delay_allreduce,
                            message_size=250000000)

    return model, optimizer
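
# prepare_optimizer_parameters is referenced by all three variants above but
# defined elsewhere in the codebase. A minimal sketch of the conventional
# BERT parameter grouping it is assumed to perform: bias and LayerNorm
# parameters are excluded from weight decay. The 0.01 decay value is a
# placeholder; the real helper would read it from the training config.
def prepare_optimizer_parameters_sketch(args, model):
    named_params = list(model.network.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    return [
        {'params': [p for n, p in named_params
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in named_params
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]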