        # Check that LTFB is evaluating on the correct models
        tol = 1e-4
        for step in range(_num_epochs - 1):
            for trainer in range(num_trainers):
                partner = ltfb_partners[trainer][step]
                winner = ltfb_winners[trainer][step]
                local_val = tournament_metrics[trainer][2 * step]
                partner_val = tournament_metrics[trainer][2 * step + 1]
                winner_val = validation_metrics[trainer][step + 1]
                true_local_val = validation_metrics[trainer][step]
                true_partner_val = validation_metrics[partner][step]
                true_winner_val = validation_metrics[winner][step]
                assert true_local_val - tol < local_val < true_local_val + tol, \
                    'Incorrect metric value for LTFB local model'
                assert true_partner_val - tol < partner_val < true_partner_val + tol, \
                    'Incorrect metric value for LTFB partner model'
                assert true_winner_val - tol < winner_val < true_winner_val + tol, \
                    'Incorrect metric value for LTFB winner model'

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     nodes=2,
                                     lbann_args='--procs_per_trainer=2'):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
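# ----------------------------------------------------------------------------
# Illustrative sketch (not taken from the original suite): the factory pattern
# used above -- wrap each generated test function, copy its __name__, and
# register the wrapper in globals() so PyTest can discover it. The names
# `_fake_generated_test` and `augment` below are hypothetical stand-ins.
def _sketch_factory_pattern():
    def _fake_generated_test(cluster, dirname):
        # Stand-in for a test function produced by tools.create_tests
        return {'stdout_log_file': '/dev/null', 'work_dir': '.'}

    def augment(test_func):
        test_name = 'test_augmented_' + test_func.__name__

        def func(cluster, dirname, weekly):
            output = test_func(cluster, dirname)
            # ... post-run checks on `output` (metrics, checkpoints) go here ...
            return output

        # Give the wrapper the name PyTest should collect it under
        func.__name__ = test_name
        return func

    wrapper = augment(_fake_generated_test)
    # Registering in globals() makes the wrapper visible to PyTest collection
    globals()[wrapper.__name__] = wrapper
# ----------------------------------------------------------------------------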
    The Python data reader will import the current Python file to access the
    sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for test in tools.create_tests(setup_experiment, __file__):
    globals()[test.__name__] = test
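# ----------------------------------------------------------------------------
# Illustrative sketch (an assumption about the convention, not the original
# file's definitions): `create_python_data_reader` above is passed the names
# 'get_sample', 'num_samples', and 'sample_dims', so the current file is
# expected to expose sample-access functions along these lines. The data
# values here are made up for illustration.
import numpy as np

_sketch_samples = np.random.default_rng(0).normal(size=(23, 7)).astype(np.float32)

def get_sample(index):
    # Return one sample as a flat array-like object
    return _sketch_samples[index, :]

def num_samples():
    # Total number of samples available to the reader
    return _sketch_samples.shape[0]

def sample_dims():
    # Dimensions of a single sample
    return (_sketch_samples.shape[1],)
# ----------------------------------------------------------------------------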
        err = 0
        err_dirs = ''
        fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt))
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            checkpoint_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in baseline directory: ' + err_msg
        assert err == 0, err_dirs

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        test_name_base=test_name_base,
        nodes=num_nodes,
        work_subdir='baseline',
        lbann_args=['--disable_cuda', '--num_epochs=' + str(num_ckpt_epochs)]):
    globals()[_test_func.__name__] = create_test_func(_test_func)
    def func(cluster, dirname, weekly):

        # Run LBANN experiment baseline
        print(
            '\n################################################################################'
        )
        print('Running model halfway')
        print(
            '################################################################################\n'
        )
        baseline_test_output = test_func(cluster, dirname)
        baseline_training_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        baseline_validation_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'],
            'validation objective function')
        baseline_test_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'],
            'test objective function')

        # Run LBANN model to checkpoint
        print(
            '\n################################################################################'
        )
        print('Running model to checkpointed weights')
        print(
            '################################################################################\n'
        )
        test_func_checkpoint = tools.create_tests(
            setup_experiment,
            __file__,
            test_name_base=test_name_base,
            nodes=num_nodes,
            work_subdir='reload_weights_from_checkpoint',
            lbann_args=[
                '--disable_cuda',
                '--num_epochs=' + str(num_restart_epochs),
                '--load_model_weights_dir=' + os.path.join(
                    baseline_test_output['work_dir'], checkpoint_dir,
                    'trainer0')
            ],
        )
        checkpoint_test_output = test_func_checkpoint[0](cluster, dirname)
        checkpoint_training_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        checkpoint_validation_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'validation objective function')
        checkpoint_test_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'test objective function')

        print(
            '\n################################################################################'
        )
        print('Running model from save_model weights')
        print(
            '################################################################################\n'
        )
        test_func_restart = tools.create_tests(
            setup_experiment,
            __file__,
            test_name_base=test_name_base,
            nodes=num_nodes,
            work_subdir='reload_weights_from_save_model_cb',
            lbann_args=[
                '--disable_cuda',
                '--num_epochs=' + str(num_restart_epochs),
                '--load_model_weights_dir=' + os.path.join(
                    baseline_test_output['work_dir'], save_model_dir,
                    'trainer0', 'model0/'),
                '--load_model_weights_dir_is_complete'
            ],
        )

        # Restart LBANN model and run to completion
        restart_test_output = test_func_restart[0](cluster, dirname)
        restart_training_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        restart_validation_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'],
            'validation objective function')
        restart_test_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'],
            'test objective function')

        print(
            '\n################################################################################'
        )
        print('Comparing results of models')
        print(
            '################################################################################\n'
        )

        # Check that metrics are the same in baseline and test experiments
        # Note: "Print statistics" callback will print up to 6 digits
        # of metric values.

        # Compare training objective functions
        tools.compare_metrics(checkpoint_training_metrics,
                              restart_training_metrics)
        # Compare validation objective functions
        tools.compare_metrics(checkpoint_validation_metrics,
                              restart_validation_metrics)
        # Compare test objective functions
        tools.compare_metrics(checkpoint_test_metrics, restart_test_metrics)

        baseline_ckpt = os.path.join(baseline_test_output['work_dir'],
                                     checkpoint_dir)
        checkpoint_ckpt = os.path.join(checkpoint_test_output['work_dir'],
                                       checkpoint_dir)
        restart_ckpt = os.path.join(restart_test_output['work_dir'],
                                    checkpoint_dir)

        err = 0
        err_dirs = ''
        fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt))
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            checkpoint_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in baseline directory: ' + err_msg
        assert err == 0, err_dirs
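# ----------------------------------------------------------------------------
# Illustrative sketch (an assumption about what a log-scraping helper such as
# `collect_metrics_from_log_func` might do, not its actual implementation):
# scan a stdout log for lines matching a phrase like
# 'training epoch [0-9]+ objective function' and collect the numeric value
# that follows. The helper name `_sketch_collect_metrics` is hypothetical.
import re

def _sketch_collect_metrics(log_file, phrase_pattern):
    # Match "<phrase> : <float>" and capture the float
    pattern = re.compile(phrase_pattern +
                         r'\s*:?\s*([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)')
    metrics = []
    with open(log_file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                metrics.append(float(match.group(1)))
    return metrics
# ----------------------------------------------------------------------------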
        # Only tested on Ray; skip the mini-batch check on other clusters.
        # Change this when mini-batch values are available for other clusters.

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
            'average mini-batch time is outside expected range'

        # Check for GPU usage and memory leaks
        # Note: Skip first epoch
        gpu_usages = gpu_usages[1:]
        gpu_usage = sum(gpu_usages) / len(gpu_usages)
        assert (0.75 * expected_gpu_usage[cluster]
                < gpu_usage
                < 1.25 * expected_gpu_usage[cluster]), \
            'average gpu usage is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     lbann_args=['--num_io_threads=1'],
                                     nodes=compute_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
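# ----------------------------------------------------------------------------
# Illustrative sketch (not from the original tests): the +/-25% performance
# checks above can be expressed with a small helper. The function name
# `_sketch_within_rel_tol` is hypothetical.
def _sketch_within_rel_tol(value, expected, rel_tol=0.25):
    # True if `value` lies within `rel_tol` of `expected` (both positive)
    return (1.0 - rel_tol) * expected < value < (1.0 + rel_tol) * expected

# Example usage:
#   assert _sketch_within_rel_tol(mini_batch_time,
#                                 expected_mini_batch_times[cluster]), \
#       'average mini-batch time is outside expected range'
# ----------------------------------------------------------------------------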
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment,
                               _test_name,
                               environment=tools.get_distconv_environment()):
    globals()[test.__name__] = test
    The Python data reader will import the current Python file to access the
    sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4):
    globals()[test.__name__] = test
    The Python data reader will import the current Python file to access the
    sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__):
    globals()[_test_func.__name__] = _test_func
        fileList = glob.glob('{base}/trainer0/*'.format(base=baseline_ckpt))
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            baseline_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            baseline_ckpt, checkpoint_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in baseline directory: ' + err_msg
        assert err == 0, err_dirs

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     test_name_base=test_name_base,
                                     nodes=num_nodes,
                                     work_subdir='baseline',
                                     lbann_args=['--disable_cuda=True']):
    globals()[_test_func.__name__] = create_test_func(_test_func)
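# ----------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the actual `tools.multidir_diff`):
# one way to compare two checkpoint directory trees is Python's filecmp
# module, reporting files that differ or are missing on either side. The
# helper name `_sketch_dir_diff` is hypothetical.
import filecmp
import os

def _sketch_dir_diff(dir_a, dir_b):
    # Return a list of human-readable mismatch descriptions
    mismatches = []

    def _walk(cmp, prefix=''):
        for name in cmp.left_only:
            mismatches.append('only in {}: {}'.format(
                dir_a, os.path.join(prefix, name)))
        for name in cmp.right_only:
            mismatches.append('only in {}: {}'.format(
                dir_b, os.path.join(prefix, name)))
        for name in cmp.diff_files:
            mismatches.append('differs: {}'.format(os.path.join(prefix, name)))
        for name, sub in cmp.subdirs.items():
            _walk(sub, os.path.join(prefix, name))

    _walk(filecmp.dircmp(dir_a, dir_b))
    return mismatches

# Example usage:
#   assert not _sketch_dir_diff(checkpoint_ckpt, restart_ckpt), \
#       'checkpoint and restart checkpoint directories do not match'
# ----------------------------------------------------------------------------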
            'train reconstruction error is outside expected range'

        # Check if testing reconstruction is within expected range
        assert (expected_test_pc_range[0]
                < test_pc
                < expected_test_pc_range[1]), \
            'test reconstruction error is outside expected range'

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
            'average mini-batch time is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func

m_lbann_args = f"--use_data_store --preload_data_store --metadata={metadata_prototext}"

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     lbann_args=[m_lbann_args],
                                     procs_per_node=procs_per_node,
                                     nodes=num_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for _test_func in tools.create_tests(setup_experiment, _test_name):
    globals()[_test_func.__name__] = _test_func
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
### @todo Run on >1 proc when https://github.com/LLNL/lbann/issues/1548 is resolved
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     procs_per_node=1,
                                     nodes=1):
    globals()[_test_func.__name__] = _test_func
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment, _test_name, procs_per_node=4):
    globals()[test.__name__] = test
        if cluster == 'ray':
            # Check if mini-batch time is within expected range
            # Note: Skip first epoch since its runtime is usually an outlier
            mini_batch_times = mini_batch_times[1:]
            mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
            assert (0.75 * expected_mini_batch_times[cluster]
                    < mini_batch_time
                    < 1.25 * expected_mini_batch_times[cluster]), \
                'average mini-batch time is outside expected range'

            # Check for GPU usage and memory leaks
            # Note: Skip first epoch
            gpu_usages = gpu_usages[1:]
            gpu_usage = sum(gpu_usages) / len(gpu_usages)
            assert (0.75 * expected_gpu_usage[cluster]
                    < gpu_usage
                    < 1.25 * expected_gpu_usage[cluster]), \
                'average gpu usage is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     nodes=num_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment, _test_name):
    globals()[test.__name__] = test
                < expected_train_accuracy_range[1]), \
            'train accuracy is outside expected range'

        # Check if testing accuracy is within expected range
        assert (expected_test_accuracy_range[0]
                < test_accuracy
                < expected_test_accuracy_range[1]), \
            'test accuracy is outside expected range'

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
            'average mini-batch time is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        nodes=num_nodes,
        lbann_args=['--load_full_sample_list_once']):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann,
                                        current_file,
                                        'get_sample',
                                        'num_samples',
                                        'sample_dims',
                                        'test')
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        environment={"LBANN_KEEP_ERROR_SIGNALS": 1},
):
    globals()[_test_func.__name__] = _test_func
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        environment=tools.get_distconv_environment()):
    globals()[_test_func.__name__] = _test_func
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     skip_clusters=["corona"]):
    globals()[_test_func.__name__] = _test_func