set_lr(optimizer, global_step,
       args.lr_schedule, args.learning_rate,
       args.warmup_steps, args.warmup_proportion,
       config.n_embd, args.num_optim_steps)

# In distributed mode, sum gradients across workers before the optimizer
# step (a rescale denominator of 1 leaves the summed gradients as-is).
if args.local_rank != -1:
    grads = [p.grad.data for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    all_reduce_and_rescale_tensors(grads, float(1))

optimizer.step()
optimizer.zero_grad()
global_step += 1

# Print log info to file
if args.local_rank != -1:
    # Average the tracked loss/perplexity and sum the token counts
    # over all workers.
    mean_loss = sum(all_gather_list(mean_loss)) / get_world_size()
    mean_ppl = sum(all_gather_list(mean_ppl)) / get_world_size()
    n_token_real_all_proc = sum(all_gather_list(n_token_real))
    n_token_total_all_proc = sum(all_gather_list(n_token_total))
else:
    n_token_real_all_proc = n_token_real
    n_token_total_all_proc = n_token_total

# Only the main process updates the progress bar and writes the log.
if args.local_rank == -1 or get_rank() == 0:
    epoch_time = time.time() - train_start_time_epoch
    if pbar is not None:
        pbar.set_postfix_str(
            f"tok/s: {n_token_real_all_proc//epoch_time//1000}k "
            f"ppl: {mean_ppl:.2f} epoch: {epoch}")
        pbar.update(1)
    with open(join(log_dir, 'train_log.txt'), 'a+',
              buffering=1) as train_logger:
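# --- Aside: minimal sketches of the distributed helpers used above. ---
# Both all_reduce_and_rescale_tensors and all_gather_list are defined
# elsewhere in the repo; the hypothetical versions below (assuming
# torch.distributed is already initialized) only illustrate the intended
# semantics. The real gradient helper typically also buckets tensors into
# a flat buffer to reduce the number of communication calls, and the real
# all_gather_list pickles values into byte tensors rather than relying on
# the newer all_gather_object API.
import torch.distributed as dist


def all_reduce_and_rescale_tensors(tensors, rescale_denom):
    """Sum each tensor across all workers in place, then rescale it.

    Called above with rescale_denom=1.0, so gradients are summed,
    not averaged, across workers.
    """
    for t in tensors:
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        t.div_(rescale_denom)


def all_gather_list(data):
    """Collect a picklable per-worker value into a list, one entry per rank."""
    output = [None] * dist.get_world_size()
    dist.all_gather_object(output, data)  # available in torch >= 1.8
    return output

# Usage, as in the logging block above: average a per-worker scalar.
# mean_loss = sum(all_gather_list(mean_loss)) / get_world_size()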
set_lr(optimizer, global_step,
       args.lr_schedule, args.learning_rate,
       args.warmup_steps, args.warmup_proportion,
       config.n_embd, args.num_optim_steps)

# Same pattern for the coordinator model: sum its gradients across
# workers before the optimizer step.
if args.local_rank != -1:
    grads = [p.grad.data for p in coordinator.parameters()
             if p.requires_grad and p.grad is not None]
    all_reduce_and_rescale_tensors(grads, float(1))

optimizer.step()
optimizer.zero_grad()
global_step += 1

# Print log info to file
if args.local_rank != -1:
    # Average the LM and SL losses and sum the token/sample counts
    # over all workers.
    mean_loss = sum(all_gather_list(mean_loss)) / get_world_size()
    mean_sl_loss = sum(all_gather_list(mean_sl_loss)) / get_world_size()
    n_token_real_all_proc = sum(all_gather_list(n_token_real))
    n_token_total_all_proc = sum(all_gather_list(n_token_total))
    n_samples_total_all_proc = sum(all_gather_list(nb_tr_examples))
else:
    n_token_real_all_proc = n_token_real
    n_token_total_all_proc = n_token_total
    n_samples_total_all_proc = nb_tr_examples

if args.local_rank == -1 or get_rank() == 0:
    epoch_time = time.time() - train_start_time_epoch
    if pbar is not None:
        pbar.set_postfix_str(
            f"samples/s: {n_samples_total_all_proc/epoch_time:.5f} "