Example #1
def test_plot_config() -> None:
    """
    Test that PlotCrossValidationConfig raises a ValueError when required arguments are missing, and accepts a valid set of arguments.
    """
    with pytest.raises(ValueError):
        PlotCrossValidationConfig()
    PlotCrossValidationConfig(run_recovery_id="foo", epoch=1)
Example #2
def test_save_outliers(test_config: PlotCrossValidationConfig,
                       test_output_dirs: OutputFolderForTests) -> None:
    """Test to make sure the outlier file for a split is as expected"""
    test_config.outputs_directory = test_output_dirs.root_dir
    test_config.outlier_range = 0
    assert test_config.run_recovery_id
    dataset_split_metrics = {
        x: _get_metrics_df(test_config.run_recovery_id, x)
        for x in [ModelExecutionMode.VAL]
    }
    save_outliers(test_config, dataset_split_metrics,
                  test_config.outputs_directory)
    filename = f"{ModelExecutionMode.VAL.value}_outliers.txt"
    assert_text_files_match(full_file=test_config.outputs_directory / filename,
                            expected_file=full_ml_test_data_path(filename))
    # Now test without the CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns, which will be missing in institutions' environments
    dataset_split_metrics_pruned = {
        x: _get_metrics_df(test_config.run_recovery_id, x).drop(
            columns=[CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER],
            errors="ignore")
        for x in [ModelExecutionMode.VAL]
    }
    save_outliers(test_config, dataset_split_metrics_pruned,
                  test_config.outputs_directory)
    test_data_filename = f"{ModelExecutionMode.VAL.value}_outliers_pruned.txt"
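    # Note: save_outliers writes to the same output filename as before; only the expected
    # (checked-in) comparison file differs for the pruned case.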
    assert_text_files_match(
        full_file=test_config.outputs_directory / filename,
        expected_file=full_ml_test_data_path(test_data_filename))
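Several of these examples call a _get_metrics_df helper that is not part of the listing; its signature even varies between versions (Example #2 passes a run recovery ID and an execution mode, Examples #3, #5 and #16 pass only the mode). Below is a purely hypothetical sketch of the two-argument form, assuming the expected metrics live in a checked-in CSV per execution mode; the file location and the "split" column are assumptions, not the repository's actual code, and the sketch reuses the names already present in the excerpts.
def _get_metrics_df(run_recovery_id: str, mode: ModelExecutionMode) -> pd.DataFrame:
    # Hypothetical sketch: load the checked-in metrics for one execution mode and tag
    # every row with the cross-validation split it belongs to.
    metrics_path = full_ml_test_data_path("plot_cross_validation") / mode.value / "metrics.csv"  # assumed location
    df = pd.read_csv(metrics_path)
    df["split"] = run_recovery_id  # a "split" column is what the comparison tests inspect (see Example #4)
    return df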
Example #3
def test_save_outliers(test_config_ensemble: PlotCrossValidationConfig,
                       test_output_dirs: OutputFolderForTests) -> None:
    """Test to make sure the outlier file for a split is as expected"""
    test_config_ensemble.outputs_directory = test_output_dirs.root_dir
    test_config_ensemble.outlier_range = 0
    dataset_split_metrics = {x: _get_metrics_df(x) for x in [ModelExecutionMode.VAL]}
    save_outliers(test_config_ensemble, dataset_split_metrics, test_config_ensemble.outputs_directory)
    f = f"{ModelExecutionMode.VAL.value}_outliers.txt"
    assert_text_files_match(full_file=test_config_ensemble.outputs_directory / f,
                            expected_file=full_ml_test_data_path(f))
Example #4
def test_add_comparison_data(test_config_comparison: PlotCrossValidationConfig) -> None:
    test_config_comparison.epoch = 2
    test_config_comparison.comparison_epochs = [2]
    metrics_df, root_folder = download_metrics(test_config_comparison)
    initial_metrics = pd.concat(list(metrics_df.values()))
    all_metrics, focus_splits = add_comparison_data(test_config_comparison, initial_metrics)
    focus_split = test_config_comparison.run_recovery_id
    comparison_split = test_config_comparison.comparison_run_recovery_ids[0]
    assert focus_splits == [focus_split]
    assert set(all_metrics.split) == {focus_split, comparison_split}
Example #5
def test_save_outliers(test_config_ensemble: PlotCrossValidationConfig,
                       test_output_dirs: TestOutputDirectories) -> None:
    """Test to make sure the outlier file for a split is as expected"""
    test_config_ensemble.outputs_directory = test_output_dirs.root_dir
    test_config_ensemble.outlier_range = 0
    dataset_split_metrics = {x: _get_metrics_df(x) for x in [ModelExecutionMode.VAL]}
    save_outliers(test_config_ensemble, dataset_split_metrics, Path(test_config_ensemble.outputs_directory))
    outlier_filename = f"{ModelExecutionMode.VAL.value}_outliers.txt"
    assert_file_contents_match_exactly(
        full_file=Path(test_config_ensemble.outputs_directory) / outlier_filename,
        expected_file=Path(full_ml_test_data_path(outlier_filename)))
Example #6
def create_run_result_file_list(config: PlotCrossValidationConfig,
                                folder: str) -> List[RunResultFiles]:
    """
    Creates a list of input files for cross validation analysis, from files stored inside the test data folder.
    :param config: The overall cross validation config.
    :param folder: The folder to read from, inside test_data/plot_cross_validation.
    :return: A list of RunResultFiles, one per split and execution mode for which a metrics file exists.
    """
    full_folder = full_ml_test_data_path("plot_cross_validation") / folder
    files: List[RunResultFiles] = []
    previous_dataset_file = None
    for split in ["0", "1"]:
        for mode in config.execution_modes_to_download():
            metrics_file = full_folder / split / mode.value / METRICS_FILE_NAME
            dataset_file: Optional[Path] = full_folder / split / DATASET_CSV_FILE_NAME
            if dataset_file.exists():  # type: ignore
                # Reduce the number of large checked-in files: dataset files can be large and usually
                # duplicate across runs, so only one copy is stored in split 0 and re-used for split 1.
                previous_dataset_file = dataset_file
            else:
                dataset_file = previous_dataset_file
            if metrics_file.exists():
                file = RunResultFiles(execution_mode=mode,
                                      metrics_file=metrics_file,
                                      dataset_csv_file=dataset_file,
                                      run_recovery_id=config.run_recovery_id + "_" + split,  # type: ignore
                                      split_index=split)
                files.append(file)
    return files
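The list returned by create_run_result_file_list feeds directly into load_dataframes; a minimal usage sketch, mirroring what Example #8 below does with the checked-in multi_label_sequence_in_crossval folder:
# Usage sketch, mirroring Example #8: build the file list for a checked-in folder,
# then load one metrics dataframe per execution mode for cross validation analysis.
plotting_config = PlotCrossValidationConfig(run_recovery_id="foo",
                                            epoch=1,
                                            model_category=ModelCategory.Classification)
files = create_run_result_file_list(plotting_config, "multi_label_sequence_in_crossval")
metrics_by_mode = load_dataframes(files, plotting_config)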
Example #7
@pytest.mark.parametrize("is_current_run", [True, False])  # parametrization assumed; not shown in the listing
def test_download_or_get_local_blobs(is_current_run: bool,
                                     test_config: PlotCrossValidationConfig,
                                     test_output_dirs: OutputFolderForTests) -> None:
    azure_config = get_default_azure_config()
    azure_config.get_workspace()
    assert test_config.run_recovery_id is not None
    run = Run.get_context() if is_current_run else azure_config.fetch_run(test_config.run_recovery_id)
    run_outputs_dir = full_ml_test_data_path() if is_current_run else Path(DEFAULT_AML_UPLOAD_DIR)
    test_config.outputs_directory = run_outputs_dir
    dst = test_config.download_or_get_local_file(
        blob_to_download="dataset.csv",
        destination=test_output_dirs.root_dir,
        run=run
    )
    assert dst is not None
    assert dst.exists()
Example #8
def test_load_files_with_prediction_target() -> None:
    """
    For multi-week RNNs that predict at multiple sequence points: Test that the dataframes
    including the prediction_target column can be loaded.
    """
    folder = "multi_label_sequence_in_crossval"
    plotting_config = PlotCrossValidationConfig(
        run_recovery_id="foo",
        epoch=1,
        model_category=ModelCategory.Classification)
    files = create_run_result_file_list(plotting_config, folder)

    downloaded_metrics = load_dataframes(files, plotting_config)
    assert ModelExecutionMode.TEST not in downloaded_metrics
    metrics = downloaded_metrics[ModelExecutionMode.VAL]
    assert metrics is not None
    assert LoggingColumns.Hue.value in metrics
    # The prediction target column should always be read as a string, because we will later use it to create
    # hue values for a MetricsDict.
    assert is_string_dtype(metrics[LoggingColumns.Hue.value].dtype)
    assert LoggingColumns.Epoch.value in metrics
    assert LoggingColumns.Patient.value in metrics
    assert len(metrics[LoggingColumns.Hue.value].unique()) == 3
    # Each of the two CV folds has 2 distinct subjects
    assert len(metrics[LoggingColumns.Patient.value].unique()) == 4
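To make the assertions above concrete, here is an illustrative dataframe of the shape the loaded VAL metrics are expected to have; the values are invented and not taken from the checked-in test data:
# Illustrative only: the prediction-target ("Hue") column holds strings, with epoch and
# patient identifiers alongside it, so the dtype check below passes.
example_metrics = pd.DataFrame({
    LoggingColumns.Hue.value: ["01", "02", "03", "01"],
    LoggingColumns.Epoch.value: [1, 1, 1, 1],
    LoggingColumns.Patient.value: [1, 1, 2, 2],
})
assert is_string_dtype(example_metrics[LoggingColumns.Hue.value].dtype)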
Example #9
@pytest.fixture  # decorator assumed: this function is consumed as a fixture by the comparison tests (see Example #4)
def test_config_comparison() -> PlotCrossValidationConfig:
    return PlotCrossValidationConfig(
        run_recovery_id=DEFAULT_ENSEMBLE_RUN_RECOVERY_ID + "_0",
        epoch=1,
        comparison_run_recovery_ids=[DEFAULT_ENSEMBLE_RUN_RECOVERY_ID + "_1"],
        comparison_epochs=[1],
        model_category=ModelCategory.Segmentation)
Example #10
def test_download_non_existing_file_in_crossval(test_output_dirs: OutputFolderForTests) -> None:
    """
    Downloading a non-existing file when trying to load cross validation results
    should not raise an exception.
    """
    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    config = PlotCrossValidationConfig(run_recovery_id=None,
                                       model_category=ModelCategory.Classification,
                                       epoch=None,
                                       should_validate=False)
    config.outputs_directory = test_output_dirs.root_dir
    does_not_exist = "does_not_exist.txt"
    result = config.download_or_get_local_file(run,
                                               blob_to_download=does_not_exist,
                                               destination=test_output_dirs.root_dir)
    assert result is None
Example #11
@pytest.fixture  # decorator assumed: this function is consumed as a fixture by the comparison tests
def test_config_comparison() -> PlotCrossValidationConfig:
    return PlotCrossValidationConfig(
        run_recovery_id=get_most_recent_run_id() + "_0",
        epoch=1,
        comparison_run_recovery_ids=[get_most_recent_run_id() + "_1"],
        model_category=ModelCategory.Segmentation
    )
Example #12
def test_add_comparison_data() -> None:
    fallback_run = get_most_recent_run_id(
        fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    crossval_config = PlotCrossValidationConfig(
        run_recovery_id=fallback_run + "_0",
        epoch=1,
        comparison_run_recovery_ids=[fallback_run + "_1"],
        model_category=ModelCategory.Segmentation)
    crossval_config.epoch = 2
    metrics_df, root_folder = download_metrics(crossval_config)
    initial_metrics = pd.concat(list(metrics_df.values()))
    all_metrics, focus_splits = add_comparison_data(crossval_config,
                                                    initial_metrics)
    focus_split = crossval_config.run_recovery_id
    comparison_split = crossval_config.comparison_run_recovery_ids[0]
    assert focus_splits == [focus_split]
    assert set(all_metrics.split) == {focus_split, comparison_split}
Example #13
def load_result_files_for_classification() -> \
        Tuple[List[RunResultFiles], PlotCrossValidationConfig]:
    plotting_config = PlotCrossValidationConfig(
        run_recovery_id="local_branch:HD_cfff5ceb-a227-41d6-a23c-0ebbc33b6301",
        epoch=3,
        model_category=ModelCategory.Classification)
    files = create_run_result_file_list(
        config=plotting_config,
        folder="HD_cfff5ceb-a227-41d6-a23c-0ebbc33b6301")
    return files, plotting_config
Example #14
def load_result_files_for_classification(perform_sub_fold_cross_validation: bool = False) -> \
        Tuple[List[RunResultFiles], PlotCrossValidationConfig]:
    run_recovery_id = "local_branch:HD_cfff5ceb-a227-41d6-a23c-0ebbc33b6301"
    files = create_run_result_file_list(
        run_recovery_id=run_recovery_id,
        folder="HD_cfff5ceb-a227-41d6-a23c-0ebbc33b6301",
        perform_sub_fold_cross_validation=perform_sub_fold_cross_validation)
    plotting_config = PlotCrossValidationConfig(
        run_recovery_id=run_recovery_id,
        epoch=3,
        model_category=ModelCategory.Classification)
    return files, plotting_config
Example #15
def test_download_or_get_local_file_2(
        test_output_dirs: OutputFolderForTests) -> None:
    config = PlotCrossValidationConfig(
        run_recovery_id=None,
        model_category=ModelCategory.Classification,
        epoch=None,
        should_validate=False)
    download_to_folder = test_output_dirs.root_dir / CROSSVAL_RESULTS_FOLDER
    config.outputs_directory = download_to_folder
    local_results = full_ml_test_data_path(
        "plot_cross_validation") / "HD_cfff5ceb-a227-41d6-a23c-0ebbc33b6301"
    config.local_run_results = str(local_results)
    # A file that sits in the root folder of the local_results should be downloaded into the
    # root of the download_to folder
    file1 = "dummy.txt"
    file_in_folder = config.download_or_get_local_file(None, file1,
                                                       download_to_folder)
    assert file_in_folder is not None
    assert file_in_folder == download_to_folder / file1

    # Copying a file in a sub-folder of the local_results: The full path to the file should be
    # preserved and created in the download_to folder.
    file2 = Path("0") / "Val" / "metrics.csv"
    file_in_folder = config.download_or_get_local_file(None, file2,
                                                       download_to_folder)
    assert file_in_folder is not None
    assert file_in_folder == download_to_folder / file2
Example #16
def test_metrics_preparation_for_segmentation(test_config_ensemble: PlotCrossValidationConfig) -> None:
    """
    Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but were
    originally downloaded from the run whose ID is given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
    """
    files = create_file_list_for_segmentation_recovery_run(test_config_ensemble)
    downloaded_metrics = load_dataframes(files, test_config_ensemble)
    for mode in test_config_ensemble.execution_modes_to_download():
        expected_df = _get_metrics_df(mode)
        # Drop the "mode" column, because that was added after creating the test data
        metrics = downloaded_metrics[mode]
        assert metrics is not None
        actual_df = metrics.drop(COL_MODE, axis=1)
        actual_df = actual_df.sort_values(list(actual_df.columns), ascending=True).reset_index(drop=True)
        pd.testing.assert_frame_equal(expected_df, actual_df, check_like=True, check_dtype=False)
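The comparison above passes check_like=True, which makes pd.testing.assert_frame_equal ignore the order of columns and index labels and compare the aligned values only; check_dtype=False additionally allows, for example, an int column to match a float column with equal values. A tiny self-contained demonstration:
# check_like=True: column/index order does not matter, but labels and values still must match.
a = pd.DataFrame({"x": [1, 2], "y": [3.0, 4.0]})
b = pd.DataFrame({"y": [3.0, 4.0], "x": [1, 2]})
pd.testing.assert_frame_equal(a, b, check_like=True)  # passes despite different column order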
Example #17
def test_aggregate_files_with_prediction_target(test_output_dirs: TestOutputDirectories) -> None:
    """
    For multi-week RNNs that predict at multiple sequence points: Test that the dataframes
    including the prediction_target column can be aggregated.
    """
    plotting_config = PlotCrossValidationConfig(
        run_recovery_id="foo",
        epoch=1,
        model_category=ModelCategory.Classification
    )
    files = create_run_result_file_list(plotting_config, "multi_label_sequence_in_crossval")

    root_folder = Path(test_output_dirs.root_dir)
    print(f"Writing result files to {root_folder}")
    plot_cross_validation_from_files(OfflineCrossvalConfigAndFiles(config=plotting_config, files=files),
                                     root_folder=root_folder)
Example #18
@pytest.mark.parametrize("drop_column", [None, CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER])  # parametrization assumed from the docstring
def test_metrics_preparation_for_segmentation(
        drop_column: Optional[str], test_config: PlotCrossValidationConfig,
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but were
    originally downloaded from the run whose ID is given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
    Additionally test that CSV_INSTITUTION_HEADER or CSV_SERIES_HEADER can be dropped from the dataset_csv_file.
    """
    files = create_file_list_for_segmentation_recovery_run(test_config)
    if drop_column:

        def drop_csv_column(path: Path) -> None:
            """
            Load a csv file, drop a column, and save the csv file.
            :param path: Path to csv file.
            """
            df = pd.read_csv(path)
            dropped_df = df.drop(drop_column, axis=1)
            dropped_df.to_csv(path)

        files = copy_run_result_files(files, full_ml_test_data_path(),
                                      test_output_dirs.root_dir,
                                      drop_csv_column)
    downloaded_metrics = load_dataframes(files, test_config)
    assert test_config.run_recovery_id
    for mode in test_config.execution_modes_to_download():
        expected_df = _get_metrics_df(test_config.run_recovery_id, mode)
        if drop_column:
            # If a column was dropped from the dataset CSV, the loaded metrics contain it as empty
            # strings, so the expected dataframe has to reflect that.
            expected_df[drop_column] = ''
        # Drop the "mode" column, because that was added after creating the test data
        metrics = downloaded_metrics[mode]
        assert metrics is not None
        actual_df = metrics.drop(COL_MODE, axis=1)
        actual_df = actual_df.sort_values(
            list(actual_df.columns), ascending=True).reset_index(drop=True)
        pd.testing.assert_frame_equal(expected_df,
                                      actual_df,
                                      check_like=True,
                                      check_dtype=False)
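Example #18 also relies on a copy_run_result_files helper that is not shown in the listing. Purely as a hypothetical sketch of the behaviour the test depends on (copy every result file below the destination folder and apply the given transform once per copied dataset CSV), and not the repository's actual code, an implementation could look roughly like this (standard-library shutil; all source files assumed to live below source_root):
def copy_run_result_files(files: List[RunResultFiles], source_root: Path, destination_root: Path,
                          transform: Callable[[Path], None]) -> List[RunResultFiles]:
    # Hypothetical sketch: copy metrics and dataset files, keep their relative layout,
    # apply the transform once per copied dataset CSV, and return RunResultFiles that
    # point at the copies.
    copied: List[RunResultFiles] = []
    transformed_datasets = set()
    for f in files:
        new_metrics = destination_root / f.metrics_file.relative_to(source_root)
        new_metrics.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(f.metrics_file, new_metrics)
        new_dataset: Optional[Path] = None
        if f.dataset_csv_file is not None:
            new_dataset = destination_root / f.dataset_csv_file.relative_to(source_root)
            if new_dataset not in transformed_datasets:
                new_dataset.parent.mkdir(parents=True, exist_ok=True)
                shutil.copyfile(f.dataset_csv_file, new_dataset)
                transform(new_dataset)
                transformed_datasets.add(new_dataset)
        copied.append(RunResultFiles(execution_mode=f.execution_mode,
                                     metrics_file=new_metrics,
                                     dataset_csv_file=new_dataset,
                                     run_recovery_id=f.run_recovery_id,
                                     split_index=f.split_index))
    return copied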
Example #19
@pytest.fixture  # decorator assumed: this function is consumed as the test_config fixture by other tests
def test_config() -> PlotCrossValidationConfig:
    return PlotCrossValidationConfig(run_recovery_id=DEFAULT_RUN_RECOVERY_ID,
                                     epoch=1,
                                     model_category=ModelCategory.Segmentation)
Example #20
@pytest.fixture  # decorator assumed: this function is consumed as the test_config fixture by other tests
def test_config() -> PlotCrossValidationConfig:
    return PlotCrossValidationConfig(run_recovery_id=get_most_recent_run_id(),
                                     epoch=1,
                                     model_category=ModelCategory.Segmentation)