"to use distributed and fp16 training.") # https://nvidia.github.io/apex/amp.html optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False) model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, # verbose=False) # else: # optimizer = FP16_Optimizer(optimizer, # static_loss_scale=args.loss_scale, # verbose=False) else: optimizer = Adam(optimizer_grouped_parameters, args.learning_rate) ######################################################################### # Training ! ########################################################################## if args.local_rank == -1: train_logger = open(join(log_dir, 'train_log.txt'), 'a+', buffering=1) eval_logger = open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1) print( 'epoch,global_step,step,mean_loss,mean_ppl,n_token_real,' 'n_token_total,epoch_time', file=train_logger) print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger) global_step = 0
# Variant of the block above that uses the older apex FP16_Optimizer API instead
# of amp: gradient clipping is passed to the optimizers via max_grad_norm and
# FP16_Optimizer wraps FusedAdam to apply static or dynamic loss scaling.
    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
        # loss_scale == 0 means "choose the scale dynamically"
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
                                   verbose=False)
    else:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   verbose=False)
else:
    optimizer = Adam(optimizer_grouped_parameters, args.learning_rate,
                     max_grad_norm=1.0)

#########################################################################
# Training !
#########################################################################

# only the main process (or a non-distributed run) writes the CSV logs
if args.local_rank == -1 or get_rank() == 0:
    train_logger = open(join(log_dir, 'train_log.txt'), 'a+', buffering=1)
    eval_logger = open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1)
    print('epoch,global_step,step,mean_loss,mean_ppl,n_token_real,'
          'n_token_total,epoch_time', file=train_logger)
    print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger)
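
# Illustrative sketch (not part of the original script): with the FP16_Optimizer
# wrapper above, the training step calls optimizer.backward(loss) instead of
# loss.backward() so the loss is scaled before backpropagation.
# `fp16_train_step_legacy` and `batch` are hypothetical names; `model` and
# `optimizer` are the objects built in the variant above, and gradient clipping
# is already handled inside FusedAdam via max_grad_norm=1.0.
def fp16_train_step_legacy(model, optimizer, batch):
    loss = model(*batch)      # placeholder forward pass that returns a scalar loss
    optimizer.backward(loss)  # FP16_Optimizer scales the loss, then backpropagates
    optimizer.step()          # unscale grads, update fp32 master weights, copy back to fp16
    optimizer.zero_grad()
    return loss.item()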