def metrics_func(model, epoch, output_predictions=False):
    """Compute correct/total accuracy over all finetuning dataloaders.

    Note: `dataloaders`, `args`, and `calculate_correct_answers` are
    resolved from the enclosing scope.
    """
    print_rank_last('calculating metrics ...')
    correct = 0
    total = 0
    if output_predictions:
        assert mpu.get_data_parallel_world_size() == 1
        named_predictions = []
        names = 'predictions'
    for name, dataloader in dataloaders:
        output = calculate_correct_answers(name, model, dataloader,
                                           epoch, output_predictions)
        if not output_predictions:
            correct_ans, total_count = output
        else:
            correct_ans, total_count, predictions = output
            named_predictions.append((name, predictions))
            names += '_' + name
        correct += correct_ans
        total += total_count
    if is_last_rank():
        percent = float(correct) * 100.0 / float(total)
        print(' >> |epoch: {}| overall: correct / total = {} / {} = '
              '{:.4f} %'.format(epoch, correct, total, percent))
    if output_predictions and is_last_rank():
        assert args.load is not None
        filename = os.path.join(args.load, names + '.pt')
        torch.save(named_predictions, filename)
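
# A minimal, hypothetical helper (not in the original source) showing how the
# predictions file written by metrics_func can be read back: it was saved
# with torch.save as a list of (dataloader name, predictions) pairs, under
# args.load, with the concatenated dataloader names as its filename.
def load_named_predictions(load_dir, names='predictions'):
    import os
    import torch
    filename = os.path.join(load_dir, names + '.pt')
    # Returns a list of (name, predictions) tuples, mirroring the save call.
    return torch.load(filename)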

def evaluate_and_print_results(task, data_loader, model, eval_metric):
    """Evaluate and print results on screen."""

    # Evaluate and get results.
    output = evaluate(data_loader, model, eval_metric)

    string = ' validation results on {} | '.format(task)
    if is_last_rank():
        if eval_metric == 'loss':
            num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
            num_original_tokens = data_loader.dataset.num_original_tokens
            val_loss = output / (num_tokenized_tokens - 1)
            ppl = math.exp(min(20, val_loss))
            token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
            adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
            string += 'avg loss: {:.4E} | '.format(val_loss)
            string += 'ppl: {:.4E} | '.format(ppl)
            string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
            string += 'token ratio: {} |'.format(token_ratio)
        elif eval_metric == 'accuracy':
            num_examples = len(data_loader.dataset)
            acc = output / num_examples
            string += 'number correct: {:.4E} | '.format(output)
            string += 'total examples: {:.4E} | '.format(num_examples)
            string += 'avg accuracy: {:.4E}'.format(acc)
        else:
            raise NotImplementedError('evaluation method for {} metric is not '
                                      'implemented yet.'.format(eval_metric))

        length = len(string) + 1
        print('-' * length)
        print(string)
        print('-' * length)
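
# A standalone sketch (an addition, not part of the original file) of the
# perplexity arithmetic above. Perplexity is exp(mean loss per predicted
# token); the 'adjusted' variant rescales the loss by the ratio of tokenized
# tokens to original tokens, so that scores obtained with different
# tokenizers stay comparable. The loss is clamped at 20 before
# exponentiation, as in evaluate_and_print_results, to avoid overflow.
def compute_perplexities(total_loss, num_tokenized_tokens, num_original_tokens):
    import math
    val_loss = total_loss / (num_tokenized_tokens - 1)
    token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
    ppl = math.exp(min(20, val_loss))
    adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
    return val_loss, ppl, adjusted_ppl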

def evaluate_and_print_results(prefix, forward_step_func,
                               data_iterator, model,
                               iteration, verbose=False):
    """Helper function to evaluate and dump results on screen."""
    args = get_args()
    writer = get_tensorboard_writer()

    total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
    string = ' validation loss at {} | '.format(prefix)
    for key in total_loss_dict:
        string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
        ppl = math.exp(min(20, total_loss_dict[key].item()))
        string += '{} PPL: {:.6E} | '.format(key, ppl)
        if writer and is_last_rank():
            writer.add_scalar('{} value-validation'.format(key),
                              total_loss_dict[key].item(),
                              iteration)
            writer.add_scalar('{} ppl-validation'.format(key), ppl, iteration)
            writer.add_scalar('{} value-validation vs samples'.format(key),
                              total_loss_dict[key].item(),
                              args.consumed_train_samples)
            writer.add_scalar('{} ppl-validation vs samples'.format(key),
                              ppl, args.consumed_train_samples)

    length = len(string) + 1
    print_rank_last('-' * length)
    print_rank_last(string)
    print_rank_last('-' * length)
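
# For reference, a sketch (an assumption; these helpers live elsewhere in
# Megatron) of the rank gating used throughout this file: with pipeline
# parallelism the loss is available on the last rank, so printing and
# TensorBoard writes are restricted to it. Names are prefixed with '_' to
# mark them as illustrative stand-ins rather than the actual imports.
def _is_last_rank():
    import torch
    return torch.distributed.get_rank() == (
        torch.distributed.get_world_size() - 1)


def _print_rank_last(message):
    import torch
    if torch.distributed.is_initialized():
        if _is_last_rank():
            print(message, flush=True)
    else:
        print(message, flush=True)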

def add_balance_log(writer, iteration):
    from megatron import is_last_rank

    # Stack the per-metric balance records into a single tensor, then
    # average it across all ranks: all_reduce sums in-place, so dividing
    # by the world size yields the mean.
    balance_dict_tensor = torch.vstack(
        [torch.tensor(item, device=item[0].device)
         for item in balance_dict.values()]
    ).detach()
    world_group = get_torch_default_comm()
    world_size = torch.distributed.get_world_size(group=world_group)
    torch.distributed.all_reduce(balance_dict_tensor, group=world_group)
    balance_dict_tensor /= world_size

    # Only the last rank writes to TensorBoard: one scalar per layer plus
    # a mean over layers for each balance metric.
    if writer and is_last_rank():
        for idx, metric_name in enumerate(balance_dict):
            for layer_id, val in enumerate(balance_dict_tensor[idx]):
                writer.add_scalar(f"balance-{metric_name}/layer-{layer_id}",
                                  val.item(), iteration)
            writer.add_scalar(
                f"balance-{metric_name}/all",
                balance_dict_tensor[idx].mean().item(),
                iteration,
            )
    reset_gate_hook()
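
# A self-contained sketch (an assumption, not part of the original code) of
# the averaging idiom add_balance_log relies on: torch.distributed.all_reduce
# sums a tensor across ranks in-place, so dividing by the world size
# afterwards turns the sum into a mean over ranks.
def _all_reduce_mean(tensor, group=None):
    import torch
    world_size = torch.distributed.get_world_size(group=group)
    torch.distributed.all_reduce(tensor, group=group)
    tensor /= world_size
    return tensor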

def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Advanced, skipped, and Nan iterations.
    advanced_iters_key = 'advanced iterations'
    skipped_iters_key = 'skipped iterations'
    nan_iters_key = 'nan iterations'
    # Advanced iterations.
    if not skipped_iter:
        total_loss_dict[advanced_iters_key] = total_loss_dict.get(
            advanced_iters_key, 0) + 1
    else:
        if advanced_iters_key not in total_loss_dict:
            total_loss_dict[advanced_iters_key] = 0
    # Skipped iterations.
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter

    # Update losses and set nan iterations.
    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(
                key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            is_nan = value == float('inf') or \
                     value == -float('inf') or \
                     value != value
            got_nan = got_nan or is_nan
    total_loss_dict[nan_iters_key] = total_loss_dict.get(
        nan_iters_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward-compute')
    add_to_logging('forward-recv')
    add_to_logging('forward-send')
    add_to_logging('forward-send-backward-recv')
    add_to_logging('backward-compute')
    add_to_logging('backward-recv')
    add_to_logging('backward-send')
    add_to_logging('backward-send-forward-recv')
    add_to_logging('backward-params-all-reduce')
    add_to_logging('backward-embedding-all-reduce')
    add_to_logging('optimizer-copy-to-main-grad')
    add_to_logging('optimizer-unscale-and-check-inf')
    add_to_logging('optimizer-clip-main-grad')
    add_to_logging('optimizer-copy-main-to-model-params')
    add_to_logging('optimizer')
    add_to_logging('batch-generator')

    # Calculate batch size.
    batch_size = args.micro_batch_size * args.data_parallel_size * \
        get_num_microbatches()
    total_iterations = total_loss_dict[advanced_iters_key] + \
        total_loss_dict[skipped_iters_key]

    # Tensorboard values.
    if writer and is_last_rank():
        writer.add_scalar('learning-rate', learning_rate, iteration)
        writer.add_scalar('learning-rate vs samples', learning_rate,
                          args.consumed_train_samples)
        writer.add_scalar('batch-size', batch_size, iteration)
        writer.add_scalar('batch-size vs samples', batch_size,
                          args.consumed_train_samples)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
            writer.add_scalar(key + ' vs samples', loss_dict[key],
                              args.consumed_train_samples)
        writer.add_scalar('loss-scale', loss_scale, iteration)
        writer.add_scalar('loss-scale vs samples', loss_scale,
                          args.consumed_train_samples)
        timers.write(timers_to_log, writer, iteration,
                     normalizer=total_iterations)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        elapsed_time_per_iteration = elapsed_time / total_iterations
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration-time',
                              elapsed_time_per_iteration, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(
            iteration, args.train_iters)
        log_string += ' consumed samples: {:12d} |'.format(
            args.consumed_train_samples)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time_per_iteration * 1000.0)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        log_string += ' global batch size: {:5d} |'.format(batch_size)
        for key in total_loss_dict:
            if key not in [advanced_iters_key, skipped_iters_key,
                           nan_iters_key]:
                avg = total_loss_dict[key].item() / \
                    float(max(1, total_loss_dict[advanced_iters_key]))
                if avg > 0.0:
                    log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = torch.cuda.FloatTensor([0.0])
        log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[nan_iters_key])
        total_loss_dict[advanced_iters_key] = 0
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[nan_iters_key] = 0
        print_rank_last(log_string)
        if report_memory_flag and learning_rate > 0.:
            # Report memory after optimizer state has been initialized.
            report_memory('(after {} iterations)'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag
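
# A condensed sketch (hypothetical, not from the original file) of the
# accumulate-then-reset idiom in training_log: per-iteration losses are
# summed into total_loss_dict, and at each log interval the running sum is
# divided by the number of advanced (non-skipped) iterations and then reset,
# so every printed value is the mean loss over that logging window.
def _log_interval_average(total_loss_dict, advanced_iters):
    import torch
    averages = {}
    for key, running_sum in total_loss_dict.items():
        averages[key] = running_sum.item() / float(max(1, advanced_iters))
        total_loss_dict[key] = torch.zeros_like(running_sum)  # reset window
    return averages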