Example #1
def create_run_result_file_list(config: PlotCrossValidationConfig,
                                folder: str) -> List[RunResultFiles]:
    """
    Creates a list of input files for cross validation analysis, from files stored inside the test data folder.
    :param config: The overall cross validation config
    :param folder: The folder to read from, inside test_data/plot_cross_validation.
    :return: A list of RunResultFiles objects, one for each split and execution mode for which a metrics file exists.
    """
    full_folder = full_ml_test_data_path("plot_cross_validation") / folder
    files: List[RunResultFiles] = []
    previous_dataset_file = None
    for split in ["0", "1"]:
        for mode in config.execution_modes_to_download():
            metrics_file = full_folder / split / mode.value / METRICS_FILE_NAME
            dataset_file: Optional[Path] = full_folder / split / DATASET_CSV_FILE_NAME
            if dataset_file.exists():  # type: ignore
                # Reduce the number of large checked-in files: dataset files can be large, and are usually
                # duplicated across runs. Store only one copy in split 0, and re-use it in split 1.
                previous_dataset_file = dataset_file
            else:
                dataset_file = previous_dataset_file
            if metrics_file.exists():
                file = RunResultFiles(
                    execution_mode=mode,
                    metrics_file=metrics_file,
                    dataset_csv_file=dataset_file,
                    run_recovery_id=config.run_recovery_id + "_" + split,  # type: ignore
                    split_index=split)
                files.append(file)
    return files
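
For orientation, the directory layout this helper expects can be reproduced with plain pathlib. The sketch below is self-contained and purely illustrative: SplitFiles, collect_split_files, and the literal file names "metrics.csv" and "dataset.csv" are stand-ins for the RunResultFiles class and the METRICS_FILE_NAME / DATASET_CSV_FILE_NAME constants used above, not part of the InnerEye API.

from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

METRICS_CSV = "metrics.csv"    # stand-in for METRICS_FILE_NAME
DATASET_CSV = "dataset.csv"    # stand-in for DATASET_CSV_FILE_NAME


@dataclass
class SplitFiles:
    split: str
    mode: str
    metrics_file: Path
    dataset_csv_file: Optional[Path]


def collect_split_files(root: Path, splits: List[str], modes: List[str]) -> List[SplitFiles]:
    """Walk <root>/<split>/<mode>/metrics.csv, re-using an earlier split's dataset.csv when absent."""
    results: List[SplitFiles] = []
    previous_dataset: Optional[Path] = None
    for split in splits:
        dataset_csv: Optional[Path] = root / split / DATASET_CSV
        if dataset_csv.exists():
            previous_dataset = dataset_csv
        else:
            # Same trick as in create_run_result_file_list: only split 0 carries a dataset.csv,
            # later splits re-use that copy.
            dataset_csv = previous_dataset
        for mode in modes:
            metrics = root / split / mode / METRICS_CSV
            if metrics.exists():
                results.append(SplitFiles(split, mode, metrics, dataset_csv))
    return results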
Example #2
def test_metrics_preparation_for_segmentation(test_config_ensemble: PlotCrossValidationConfig) -> None:
    """
    Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but
    were downloaded from a run whose ID is given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
    """
    files = create_file_list_for_segmentation_recovery_run(test_config_ensemble)
    downloaded_metrics = load_dataframes(files, test_config_ensemble)
    for mode in test_config_ensemble.execution_modes_to_download():
        expected_df = _get_metrics_df(mode)
        # Drop the "mode" column, because that was added after creating the test data
        metrics = downloaded_metrics[mode]
        assert metrics is not None
        actual_df = metrics.drop(COL_MODE, axis=1)
        actual_df = actual_df.sort_values(list(actual_df.columns), ascending=True).reset_index(drop=True)
        pd.testing.assert_frame_equal(expected_df, actual_df, check_like=True, check_dtype=False)
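
The assertions at the end follow a pattern that is easy to try in isolation: drop the column that was added after the test data was created, normalize row order and the index, and let pandas compare the frames while ignoring column order and dtype differences. The frame contents below are made up purely for illustration.

import pandas as pd

# Same content, but different row order, column order and numeric dtypes, plus an extra
# "mode" column on the "actual" side (mirroring COL_MODE in the test above).
expected = pd.DataFrame({"Patient": [1, 2], "Dice": [0.8, 0.9]})
actual = pd.DataFrame({"Dice": [0.9, 0.8], "Patient": [2.0, 1.0], "mode": ["Test", "Test"]})

actual = actual.drop("mode", axis=1)
actual = actual.sort_values(list(actual.columns), ascending=True).reset_index(drop=True)
expected = expected.sort_values(list(expected.columns), ascending=True).reset_index(drop=True)

# check_like=True ignores column ordering, check_dtype=False tolerates int vs. float columns.
pd.testing.assert_frame_equal(expected, actual, check_like=True, check_dtype=False)

The second variant of the test below covers the same ground, and additionally exercises dropping a column from the dataset_csv_file via its drop_column argument.
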
def test_metrics_preparation_for_segmentation(
        drop_column: Optional[str], test_config: PlotCrossValidationConfig,
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but
    were downloaded from a run whose ID is given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
    Additionally, test that CSV_INSTITUTION_HEADER or CSV_SERIES_HEADER can be dropped from the dataset_csv_file.
    """
    files = create_file_list_for_segmentation_recovery_run(test_config)
    if drop_column:

        def drop_csv_column(path: Path) -> None:
            """
            Load a csv file, drop a column, and save the csv file.
            :param path: Path to csv file.
            """
            df = pd.read_csv(path)
            dropped_df = df.drop(drop_column, axis=1)
            dropped_df.to_csv(path)

        files = copy_run_result_files(files, full_ml_test_data_path(),
                                      test_output_dirs.root_dir,
                                      drop_csv_column)
    downloaded_metrics = load_dataframes(files, test_config)
    assert test_config.run_recovery_id
    for mode in test_config.execution_modes_to_download():
        expected_df = _get_metrics_df(test_config.run_recovery_id, mode)
        if drop_column:
            # If a column was dropped from the dataset_csv_file, it comes back empty after loading,
            # so blank it in the expected dataframe as well.
            expected_df[drop_column] = ''
        # Drop the "mode" column, because that was added after creating the test data
        metrics = downloaded_metrics[mode]
        assert metrics is not None
        actual_df = metrics.drop(COL_MODE, axis=1)
        actual_df = actual_df.sort_values(
            list(actual_df.columns), ascending=True).reset_index(drop=True)
        pd.testing.assert_frame_equal(expected_df,
                                      actual_df,
                                      check_like=True,
                                      check_dtype=False)
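
The drop_column argument is not bound anywhere in this snippet; in a pytest suite it would typically come from a parametrize decorator. The sketch below shows that pattern with made-up column names ("institutionId", "seriesId") standing in for CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER; it mirrors the drop_csv_column round-trip with plain pandas and is not the repository's own test.

from pathlib import Path
from typing import Optional

import pandas as pd
import pytest


@pytest.mark.parametrize("drop_column", [None, "institutionId", "seriesId"])
def test_drop_column_roundtrip(drop_column: Optional[str], tmp_path: Path) -> None:
    """Write a small csv, optionally drop one column, and check what is read back."""
    path = tmp_path / "dataset.csv"
    pd.DataFrame({"subject": [1], "institutionId": ["a"], "seriesId": ["b"]}).to_csv(path, index=False)
    if drop_column:
        df = pd.read_csv(path)
        # index=False keeps pandas from writing an extra index column when re-saving.
        df.drop(drop_column, axis=1).to_csv(path, index=False)
    reloaded = pd.read_csv(path)
    if drop_column:
        assert drop_column not in reloaded.columns
    else:
        assert list(reloaded.columns) == ["subject", "institutionId", "seriesId"]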