Example #1
0
def create_run_result_file_list(config: PlotCrossValidationConfig,
                                folder: str) -> List[RunResultFiles]:
    """
    Builds the list of run result files used as input for cross validation
    analysis, reading from files stored inside of the test data folder.

    :param config: The overall cross validation config.
    :param folder: The folder to read from, inside of test_data/plot_cross_validation.
    :return: A list of RunResultFiles, one entry per (split, execution mode)
        combination for which a metrics file exists on disk.
    """
    base_dir = full_ml_test_data_path("plot_cross_validation") / folder
    result: List[RunResultFiles] = []
    last_seen_dataset: Optional[Path] = None
    for split_index in ["0", "1"]:
        split_dir = base_dir / split_index
        dataset_csv: Optional[Path] = split_dir / DATASET_CSV_FILE_NAME
        # Reduce amount of checked-in large files: dataset files can be large
        # and usually duplicate across runs. Only a copy in split 0 is checked
        # in; later splits re-use the most recently seen file.
        if dataset_csv.exists():  # type: ignore
            last_seen_dataset = dataset_csv
        else:
            dataset_csv = last_seen_dataset
        for mode in config.execution_modes_to_download():
            metrics_path = split_dir / mode.value / METRICS_FILE_NAME
            if not metrics_path.exists():
                continue
            result.append(
                RunResultFiles(
                    execution_mode=mode,
                    metrics_file=metrics_path,
                    dataset_csv_file=dataset_csv,
                    run_recovery_id=config.run_recovery_id + "_" + split_index,  # type: ignore
                    split_index=split_index))
    return result
Example #2
0
def test_check_result_file_counts() -> None:
    """
    More tests on the function that checks the number of files of each ModeExecutionMode.
    """
    val_files, plotting_config = load_result_files_for_classification()
    # This test assumes that the loaded val_files all have mode Val
    for file in val_files:
        assert file.execution_mode == ModelExecutionMode.VAL
    plotting_config.number_of_cross_validation_splits = len(val_files)

    def config_with(files: List[RunResultFiles]) -> OfflineCrossvalConfigAndFiles:
        # Small helper: wrap a file list with the shared plotting config.
        return OfflineCrossvalConfigAndFiles(config=plotting_config,
                                             files=files)

    # Check that when just the Val files are present, the check does not throw
    check_result_file_counts(config_with(val_files))
    # Check that when we add the same number of Test files, the check does not throw
    test_files = []
    for file in val_files:
        test_files.append(
            RunResultFiles(execution_mode=ModelExecutionMode.TEST,
                           metrics_file=file.metrics_file,
                           dataset_csv_file=file.dataset_csv_file,
                           run_recovery_id=file.run_recovery_id,
                           split_index=file.split_index))
    check_result_file_counts(config_with(val_files + test_files))
    # Check that when we have the same number of files as the number of splits,
    # but they are from a mixture of modes, the check does throw
    with pytest.raises(ValueError):
        check_result_file_counts(config_with(val_files[:1] + test_files[1:]))
def copy_run_result_files(files: List[RunResultFiles], src_prefix_path: Path,
                          dst_prefix_path: Path,
                          transformer: Callable) -> List[RunResultFiles]:
    """
    Copy dataset_csv_files from a list of RunResultFiles to a working directory, and then
    transform them using a callback. Several RunResultFiles may share the same
    dataset_csv_file; each distinct destination file is copied and transformed
    only once.

    :param files: List of RunResultFiles to copy.
    :param src_prefix_path: Shared prefix path for the dataset_csv_files to be removed.
    :param dst_prefix_path: Shared prefix path to use for the copied dataset_csv_files.
    :param transformer: Callback taking the Path of a copied dataset_csv_file;
        applied in-place to each freshly copied file.
    :return: New list of RunResultFiles pointing at the copied files.
    """
    file_copies = []
    # Destination paths already copied+transformed. A set gives O(1) membership
    # tests (the original list made each lookup O(n)) and guards against
    # re-copying a dataset file shared between several RunResultFiles.
    files_copied = set()

    for file in files:
        if not file.dataset_csv_file:
            dataset_csv_file: Optional[Path] = None
        else:
            # Replace the source prefix with the destination prefix.
            dst_dataset_csv_file = dst_prefix_path / file.dataset_csv_file.relative_to(
                src_prefix_path)
            if dst_dataset_csv_file not in files_copied:
                dst_dataset_csv_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(file.dataset_csv_file, dst_dataset_csv_file)
                files_copied.add(dst_dataset_csv_file)
                transformer(dst_dataset_csv_file)
            dataset_csv_file = dst_dataset_csv_file

        file_copies.append(
            RunResultFiles(execution_mode=file.execution_mode,
                           metrics_file=file.metrics_file,
                           dataset_csv_file=dataset_csv_file,
                           run_recovery_id=file.run_recovery_id,
                           split_index=file.split_index))

    return file_copies
Example #4
0
def create_run_result_file_list(
        run_recovery_id: str,
        folder: str,
        perform_sub_fold_cross_validation: bool = False
) -> List[RunResultFiles]:
    """
    Creates a list of input files for cross validation analysis, from files
    stored inside of the test data folder.

    :param run_recovery_id: The run recovery id, format experiment:run, without the split suffix (_0, _1)
    :param folder: The folder to read from, inside of test_data/plot_cross_validation.
    :param perform_sub_fold_cross_validation: If True then create input files for sub fold cross validation analysis.
    :return: A list of RunResultFiles for every (split, execution mode)
        combination whose metrics file exists on disk.
    """
    base_dir = full_ml_test_data_path("plot_cross_validation") / folder
    result: List[RunResultFiles] = []
    last_dataset_file: Optional[Path] = None
    # Sub-fold analysis deliberately revisits split "1" three times.
    splits = (["0", "1", "1", "1"]
              if perform_sub_fold_cross_validation else ["0", "1"])
    for split in splits:
        for mode in EXECUTION_MODES_TO_DOWNLOAD:
            metrics_file = base_dir / split / mode.value / METRICS_FILE_NAME
            dataset_file: Optional[Path] = base_dir / split / DATASET_CSV_FILE_NAME
            # Reduce amount of checked-in large files: dataset files can be
            # large and usually duplicate across runs. Only split 0 has a
            # checked-in copy; later splits re-use the last one seen.
            if dataset_file.exists():  # type: ignore
                last_dataset_file = dataset_file
            else:
                dataset_file = last_dataset_file
            if metrics_file.exists():
                result.append(
                    RunResultFiles(execution_mode=mode,
                                   metrics_file=metrics_file,
                                   dataset_csv_file=dataset_file,
                                   run_recovery_id=run_recovery_id + "_" + split,
                                   split_index=split))
    return result