def create_run_result_file_list(config: PlotCrossValidationConfig, folder: str) -> List[RunResultFiles]:
    """
    Creates a list of input files for cross validation analysis, from files stored inside of the test data folder.

    :param config: The overall cross validation config
    :param folder: The folder to read from, inside of test_data/plot_cross_validation.
    :return: A list of RunResultFiles, one per (split, mode) combination whose metrics file exists on disk.
    """
    base_dir = full_ml_test_data_path("plot_cross_validation") / folder
    results: List[RunResultFiles] = []
    last_seen_dataset: Optional[Path] = None
    for split_id in ["0", "1"]:
        split_dir = base_dir / split_id
        # Resolve the dataset file once per split: the path does not depend on the mode.
        dataset_path: Optional[Path] = split_dir / DATASET_CSV_FILE_NAME
        if dataset_path.exists():  # type: ignore
            # Reduce amount of checked-in large files. dataset files can be large, and usually duplicate across
            # runs. Store only a copy in split 0, re-use in split 1.
            last_seen_dataset = dataset_path
        else:
            dataset_path = last_seen_dataset
        for mode in config.execution_modes_to_download():
            metrics_path = split_dir / mode.value / METRICS_FILE_NAME
            if metrics_path.exists():
                results.append(RunResultFiles(execution_mode=mode,
                                              metrics_file=metrics_path,
                                              dataset_csv_file=dataset_path,
                                              run_recovery_id=config.run_recovery_id + "_" + split_id,  # type: ignore
                                              split_index=split_id))
    return results
def test_check_result_file_counts() -> None:
    """
    More tests on the function that checks the number of files of each ModeExecutionMode.
    """
    val_files, plotting_config = load_result_files_for_classification()
    # This test assumes that the loaded val_files all have mode Val
    assert all(f.execution_mode == ModelExecutionMode.VAL for f in val_files)
    plotting_config.number_of_cross_validation_splits = len(val_files)

    def _check(files: List[RunResultFiles]) -> None:
        # Bundle the config with the given files and run the count check on them.
        check_result_file_counts(OfflineCrossvalConfigAndFiles(config=plotting_config, files=files))

    # Check that when just the Val files are present, the check does not throw
    _check(val_files)
    # Check that when we add the same number of Test files, the check does not throw
    test_files = [RunResultFiles(execution_mode=ModelExecutionMode.TEST,
                                 metrics_file=f.metrics_file,
                                 dataset_csv_file=f.dataset_csv_file,
                                 run_recovery_id=f.run_recovery_id,
                                 split_index=f.split_index)
                  for f in val_files]
    _check(val_files + test_files)
    # Check that when we have the same number of files as the number of splits, but they are from a mixture
    # of modes, the check does throw
    with pytest.raises(ValueError):
        _check(val_files[:1] + test_files[1:])
def copy_run_result_files(files: List[RunResultFiles],
                          src_prefix_path: Path,
                          dst_prefix_path: Path,
                          transformer: Callable) -> List[RunResultFiles]:
    """
    Copy dataset_csv_files from a list of RunResultFiles to a working directory, and then transform them using
    a callback.

    :param files: List of RunResultFiles to copy.
    :param src_prefix_path: Shared prefix path for the dataset_csv_files to be removed.
    :param dst_prefix_path: Shared prefix path to use for the copied dataset_csv_files.
    :param transformer: Callback function applied once to each copied dataset_csv_file (called with its Path).
    :return: New list of RunResultFiles pointing at the copied files.
    """
    file_copies: List[RunResultFiles] = []
    # Destination paths that have already been copied and transformed. Several RunResultFiles can share one
    # dataset_csv_file, so track them in a set: O(1) membership tests instead of the O(n) list scan, and a
    # name clearly distinct from file_copies.
    copied_destinations = set()
    for file in files:
        if not file.dataset_csv_file:
            dataset_csv_file: Optional[Path] = None
        else:
            # Replace the source prefix with the destination prefix to get the copy's location.
            dst_dataset_csv_file = dst_prefix_path / file.dataset_csv_file.relative_to(src_prefix_path)
            if dst_dataset_csv_file not in copied_destinations:
                dst_dataset_csv_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(file.dataset_csv_file, dst_dataset_csv_file)
                copied_destinations.add(dst_dataset_csv_file)
                transformer(dst_dataset_csv_file)
            dataset_csv_file = dst_dataset_csv_file
        file_copies.append(RunResultFiles(execution_mode=file.execution_mode,
                                          metrics_file=file.metrics_file,
                                          dataset_csv_file=dataset_csv_file,
                                          run_recovery_id=file.run_recovery_id,
                                          split_index=file.split_index))
    return file_copies
def create_run_result_file_list(run_recovery_id: str,
                                folder: str,
                                perform_sub_fold_cross_validation: bool = False) -> List[RunResultFiles]:
    """
    Creates a list of input files for cross validation analysis, from files stored inside of the test data folder.

    :param run_recovery_id: The run recovery id, format experiment:run, without the split suffix (_0, _1)
    :param folder: The folder to read from, inside of test_data/plot_cross_validation.
    :param perform_sub_fold_cross_validation: If True then create input files for sub fold cross validation analysis.
    :return: A list of RunResultFiles, one per (split, mode) combination whose metrics file exists on disk.
    """
    base_dir = full_ml_test_data_path("plot_cross_validation") / folder
    splits = ["0", "1", "1", "1"] if perform_sub_fold_cross_validation else ["0", "1"]
    results: List[RunResultFiles] = []
    fallback_dataset: Optional[Path] = None
    for split in splits:
        for mode in EXECUTION_MODES_TO_DOWNLOAD:
            metrics_path = base_dir / split / mode.value / METRICS_FILE_NAME
            dataset_path: Optional[Path] = base_dir / split / DATASET_CSV_FILE_NAME
            if dataset_path.exists():  # type: ignore
                # Reduce amount of checked-in large files. dataset files can be large, and usually duplicate
                # across runs. Store only a copy in split 0, re-use in split 1.
                fallback_dataset = dataset_path
            else:
                dataset_path = fallback_dataset
            if metrics_path.exists():
                results.append(RunResultFiles(execution_mode=mode,
                                              metrics_file=metrics_path,
                                              dataset_csv_file=dataset_path,
                                              run_recovery_id=run_recovery_id + "_" + split,
                                              split_index=split))
    return results