image_prediction_method = 'as_production'
predictions_unique_name = 'test_set_CV1_0'
predictions_folder_name = 'subsets'  # could also be 'CV' or 'exploratory_exp'; all three can contain prediction files

predictions_path = build_path_results(results_path,
                                      dataset_name,
                                      pooling_operator,
                                      script_suffix=predictions_folder_name,
                                      result_suffix='predictions')
performance_path = build_path_results(results_path,
                                      dataset_name,
                                      pooling_operator,
                                      script_suffix=predictions_folder_name,
                                      result_suffix='performance')
make_directory(performance_path)

pool_dict = {'nor': "nor", "lse": "lse", "lse01": "lse", "max": "max"}
r = {'nor': 0, "lse": 1.0, "lse01": 0.1, "max": 0}
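
# The block below is an added, hypothetical illustration (not code from this repository) of what
# the r values above are assumed to parameterize: log-sum-exp (LSE) pooling of patch scores into
# an image-level score, where a small r behaves like the mean and a large r approaches the max.
import numpy as np


def lse_pool(patch_scores, r):
    """Log-sum-exp pooling: (1/r) * log(mean(exp(r * p))) over all patch scores p."""
    p = np.asarray(patch_scores, dtype=np.float64)
    return np.log(np.mean(np.exp(r * p))) / r

# e.g. lse_pool(patch_probabilities, r=0.1) stays close to the mean patch score,
# while r=1.0 (and larger) moves towards the maximum patch score.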

image_labels, image_predictions, \
has_bbox, accurate_localizations, dice_scores = keras_preds.process_prediction(config,
                                                                               predictions_unique_name,
                                                                               predictions_path,
                                                                               r=r[pooling_operator],
                                                                               pool_method=pool_dict[pooling_operator],
                                                                               img_pred_method=image_prediction_method,
                                                                               threshold_binarization=0.5,
                                                                               iou_threshold=0.1)

keras_preds.save_generated_files(predictions_path, predictions_unique_name,
                                 # the original call is truncated here; the remaining arguments
                                 # are assumed to be the values computed above
                                 image_labels, image_predictions, has_bbox,
                                 accurate_localizations, dice_scores)
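
# The block below is an added, hypothetical sketch (not code from this repository) of the kind of
# computation that threshold_binarization and iou_threshold above are assumed to control: the
# patch-probability map is binarized at 0.5 and compared against a ground-truth bbox mask.
import numpy as np


def dice_and_iou(pred_map, gt_mask, threshold_binarization=0.5):
    """Dice and IoU between a binarized prediction map and a binary ground-truth mask."""
    pred_bin = np.asarray(pred_map) >= threshold_binarization
    gt = np.asarray(gt_mask).astype(bool)
    intersection = np.logical_and(pred_bin, gt).sum()
    union = np.logical_or(pred_bin, gt).sum()
    dice = 2.0 * intersection / (pred_bin.sum() + gt.sum() + 1e-8)
    iou = intersection / (union + 1e-8)
    return dice, iou

# A localization would then count as accurate when iou >= iou_threshold (0.1 above).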


# Example 2
predictions_path = build_path_results(res_path,
                                      dataset_name,
                                      pooling_operator,
                                      script_suffix=parent_folder_predictions,
                                      result_suffix='predictions')
performance_path = build_path_results(res_path,
                                      dataset_name,
                                      pooling_operator,
                                      script_suffix=parent_folder_predictions,
                                      result_suffix='performance')
stability_path = build_path_results(res_path,
                                    dataset_name,
                                    pooling_operator,
                                    script_suffix=parent_folder_predictions,
                                    result_suffix='stability')
make_directory(stability_path)

if use_xray:
    instance_labels_collection, image_index_collection, raw_predictions_collection, bag_labels_collection, \
    bag_predictions_collection, identifier = load_and_filter_predictions(classifiers,
                                                                         only_segmentation_images=False,
                                                                         only_positive_images=True,
                                                                         predictions_path=predictions_path)

    pos_jacc, corr_pos_jacc, corr_pos_jacc_heur, pos_overlap, corr_pos_overlap, corr_iou, \
    pearson_correlation, spearman_rank_correlation = compute_stability_scores(raw_predictions_collection)
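
    # The block below is an added, hypothetical illustration (not code from this repository) of
    # the simplest of the scores above: the positive Jaccard index between two classifiers is
    # assumed to be the overlap of the patches both predict as positive after binarization.
    import numpy as np

    def positive_jaccard(raw_pred_a, raw_pred_b, threshold=0.5):
        """Jaccard index over the positively predicted patches of two classifiers."""
        a = np.asarray(raw_pred_a).ravel() >= threshold
        b = np.asarray(raw_pred_b).ravel() >= threshold
        intersection = np.logical_and(a, b).sum()
        union = np.logical_or(a, b).sum()
        return intersection / union if union > 0 else float('nan')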

    generate_visualizations_stability(
        config,
        visualize_per_image=False,
        pos_jacc=pos_jacc,
        # the original call is truncated here; the remaining keyword arguments are assumed
        # to mirror the stability scores computed above
        corr_pos_jacc=corr_pos_jacc,
        corr_pos_jacc_heur=corr_pos_jacc_heur,
        pos_overlap=pos_overlap,
        corr_pos_overlap=corr_pos_overlap,
        corr_iou=corr_iou,
        pearson_correlation=pearson_correlation,
        spearman_rank_correlation=spearman_rank_correlation)


# Example 3
def cross_validation(config, number_splits=5):
    """
    performs cross validation on a specific architecture
    :param config: yaml config file
    :param number_splits: number of different cross validation splits to test on
    :return: Returns predictions, image indices and patch labels saved in .npy file for train,test and validation set
    and for each CV split.
    """
    skip_processing = config['skip_processing_labels']
    image_path = config['image_path']
    classication_labels_path = config['classication_labels_path']
    localization_labels_path = config['localization_labels_path']
    results_path = config['results_path']
    train_mode = config['train_mode']
    dataset_name = config['dataset_name']
    class_name = config['class_name']
    mura_test_img_path = config['mura_test_img_path']
    mura_train_labels_path = config['mura_train_labels_path']
    mura_train_img_path = config['mura_train_img_path']
    mura_test_labels_path = config['mura_test_labels_path']
    mura_processed_train_labels_path = config[
        'mura_processed_train_labels_path']
    mura_processed_test_labels_path = config['mura_processed_test_labels_path']
    mura_interpolation = config['mura_interpolation']
    pascal_image_path = config['pascal_image_path']
    resized_images_before_training = config['resized_images_before_training']

    nr_epochs = config['nr_epochs']
    lr = config['lr']
    reg_weight = config['reg_weight']
    pooling_operator = config['pooling_operator']

    use_xray, use_pascal = set_dataset_flag(dataset_name)

    script_suffix = 'CV'
    trained_models_path = build_path_results(results_path,
                                             dataset_name,
                                             pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='trained_models')
    prediction_results_path = build_path_results(results_path,
                                                 dataset_name,
                                                 pooling_operator,
                                                 script_suffix=script_suffix,
                                                 result_suffix='predictions')
    make_directory(trained_models_path)
    make_directory(prediction_results_path)

    if use_xray:
        if resized_images_before_training:
            xray_df = fetch_preprocessed_images_csv(image_path,
                                                    'processed_imgs')
            #todo: delete - just for testing
            # xray_df = xray_df[-50:]
        else:
            xray_df = load_process_xray14(config)
    elif use_pascal:
        pascal_df = load_pascal(pascal_image_path)

    else:
        df_train_val, test_df_all_classes = load_mura(
            skip_processing, mura_processed_train_labels_path,
            mura_processed_test_labels_path, mura_train_img_path,
            mura_train_labels_path, mura_test_labels_path, mura_test_img_path)

    for split in range(0, number_splits):

        if use_xray:
            df_train, df_val, df_test, _, _, _ = ld.split_xray_cv(
                xray_df, number_splits, split, class_name)

        elif use_pascal:
            df_train, df_val, df_test = construct_train_test_cv(
                pascal_df, number_splits, split)

        else:
            df_train, df_val = split_data_cv(df_train_val,
                                             number_splits,
                                             split,
                                             random_seed=1,
                                             diagnose_col=class_name,
                                             ratio_to_keep=None)
            # df_test = filter_rows_on_class(test_df_all_classes, class_name=class_name)
            df_test = filter_rows_and_columns(test_df_all_classes, class_name)

        if train_mode:
            tf.keras.backend.clear_session()
            K.clear_session()

            ############################################ TRAIN ###########################################################
            train_generator = gen.BatchGenerator(
                instances=df_train.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                norm=keras_utils.normalize,
                box_size=BOX_SIZE,
                processed_y=skip_processing,
                interpolation=mura_interpolation,
                shuffle=True)

            valid_generator = gen.BatchGenerator(
                instances=df_val.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                box_size=BOX_SIZE,
                norm=keras_utils.normalize,
                processed_y=skip_processing,
                interpolation=mura_interpolation,
                shuffle=True)
            model = keras_model.build_model(reg_weight)

            model = keras_model.compile_model_accuracy(
                model, lr, pool_op=pooling_operator)

            # a checkpoint on every epoch is not really needed here; it is not used - the callback was removed from the generator call
            filepath = trained_models_path + "CV_" + str(
                split) + "_epoch-{epoch:02d}-{val_loss:.2f}.hdf5"
            checkpoint_on_epoch_end = ModelCheckpoint(filepath,
                                                      monitor='val_loss',
                                                      verbose=1,
                                                      save_best_only=False,
                                                      mode='min')

            lrate = LearningRateScheduler(keras_model.step_decay, verbose=1)
            print("df train STEPS")
            print(len(df_train) // BATCH_SIZE)
            print(train_generator.__len__())

            history = model.fit_generator(
                generator=train_generator,
                steps_per_epoch=train_generator.__len__(),
                epochs=nr_epochs,
                validation_data=valid_generator,
                validation_steps=valid_generator.__len__(),
                verbose=1,
                callbacks=[checkpoint_on_epoch_end])

            print("history")
            print(history.history)
            print(history.history['keras_accuracy'])
            np.save(trained_models_path + 'train_info_' + str(split) + '.npy',
                    history.history)

            settings = np.array({
                'lr: ': lr,
                'reg_weight: ': reg_weight,
                'pooling_operator: ': pooling_operator
            })
            np.save(trained_models_path + 'train_settings.npy', settings)
            keras_utils.plot_train_validation(history.history['loss'],
                                              history.history['val_loss'],
                                              'train loss', 'validation loss',
                                              'CV_loss' + str(split), 'loss',
                                              trained_models_path)

            ############################################    PREDICTIONS      #############################################
            predict_patch_and_save_results(model, 'test_set_CV' + str(split),
                                           df_test, skip_processing,
                                           BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'train_set_CV' + str(split),
                                           df_train, skip_processing,
                                           BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'val_set_CV' + str(split),
                                           df_val, skip_processing,
                                           BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            ##### EVALUATE function

            print("evaluate validation")
            evaluate = model.evaluate_generator(
                generator=valid_generator,
                steps=valid_generator.__len__(),
                verbose=1)

            evaluate_train = model.evaluate_generator(
                generator=train_generator,
                steps=train_generator.__len__(),
                verbose=1)
            test_generator = gen.BatchGenerator(
                instances=df_test.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                shuffle=True,
                norm=keras_utils.normalize,
                box_size=BOX_SIZE,
                processed_y=skip_processing,
                interpolation=mura_interpolation)

            evaluate_test = model.evaluate_generator(
                generator=test_generator,
                steps=test_generator.__len__(),
                verbose=1)
            print("Evaluate Train")
            print(evaluate_train)
            print("Evaluate Valid")
            print(evaluate)
            print("Evaluate test")
            print(evaluate_test)
        else:
            files_found = 0
            print(trained_models_path)
            for file_path in Path(trained_models_path).glob(
                    "CV_patient_split_" + str(split) + "*.hdf5"):
                print(file_path)
                files_found += 1

            assert files_found == 1, "Either no model or multiple models were found; it is unclear which one to use"
            print(str(files_found))
            model = load_model(str(file_path),
                               custom_objects={
                                   'keras_loss_v3_nor': keras_loss_v3_nor,
                                   'keras_accuracy': keras_accuracy,
                                   'accuracy_asloss': accuracy_asloss
                               })
            model = keras_model.compile_model_accuracy(model, lr,
                                                       pooling_operator)

            predict_patch_and_save_results(
                model, "train_set_CV" + (str(split)), df_train,
                skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                prediction_results_path, mura_interpolation,
                resized_images_before_training)
            predict_patch_and_save_results(model, "val_set_CV" + (str(split)),
                                           df_val, skip_processing,
                                           BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, "test_set_CV" + (str(split)),
                                           df_test, skip_processing,
                                           BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
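

# An added usage sketch, assuming cross_validation is driven by a small script that parses a YAML
# config first; the file name 'config_cv.yml' is made up for illustration and is not from the
# original code.
import yaml


def run_cross_validation_example():
    with open('config_cv.yml') as f:
        config = yaml.safe_load(f)
    # train/predict on 5 CV splits, writing models and predictions under results_path
    cross_validation(config, number_splits=5)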


# Example 4
def train_on_subsets(config, number_splits, CV_split_to_use,
                     number_classifiers, subset_seeds, overlap_ratio):
    """
    Trains several classifiers with similar training set, while preserving test and validation set the same.
    The aim is to compare the performance of these classifiers later in stability module.
    The script takes a specific cross validation split of training, validation and testing set, and then drops a
    portion of the samples from the training set. Validation and test set are not changed - they are as the original
    split. Then the script trains a classifier with each of the training subsets.
    :param config: yaml config file
    :param number_splits: number of cross validation  splits used in cross validation (CV) (run_cross_validation.py)
    :param CV_split_to_use: specific CV split for defining  train/test/validation set. Value is between [0, number_splits-1]
    :param number_classifiers: number of classifiers to train
    :param subset_seeds: seeds used to drop observations from original training set.
    :param overlap_ratio:  ration of observations which are preserved from the original training set, defined by the
    specific CV split.
    :return: Returns saved .npy file for the predictions, image_indices and patch labels for the train/test/valid set for
    each subset.
    """
    skip_processing = config['skip_processing_labels']
    image_path = config['image_path']
    classication_labels_path = config['classication_labels_path']
    localization_labels_path = config['localization_labels_path']
    results_path = config['results_path']
    processed_labels_path = config['processed_labels_path']
    train_mode = config['train_mode']
    dataset_name = config['dataset_name']
    class_name = config['class_name']
    mura_test_img_path = config['mura_test_img_path']
    mura_train_labels_path = config['mura_train_labels_path']
    mura_train_img_path = config['mura_train_img_path']
    mura_test_labels_path = config['mura_test_labels_path']
    mura_processed_train_labels_path = config[
        'mura_processed_train_labels_path']
    mura_processed_test_labels_path = config['mura_processed_test_labels_path']
    mura_interpolation = config['mura_interpolation']
    pascal_image_path = config['pascal_image_path']
    resized_images_before_training = config['resized_images_before_training']

    nr_epochs = config['nr_epochs']
    lr = config['lr']
    reg_weight = config['reg_weight']
    pooling_operator = config['pooling_operator']

    IMAGE_SIZE = 512
    BATCH_SIZE = 10
    BATCH_SIZE_TEST = 1
    BOX_SIZE = 16

    use_xray, use_pascal = set_dataset_flag(dataset_name)
    script_suffix = 'subsets'
    trained_models_path = build_path_results(results_path,
                                             dataset_name,
                                             pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='trained_models')
    prediction_results_path = build_path_results(results_path,
                                                 dataset_name,
                                                 pooling_operator,
                                                 script_suffix=script_suffix,
                                                 result_suffix='predictions')
    make_directory(trained_models_path)
    make_directory(prediction_results_path)

    if use_xray:
        if resized_images_before_training:
            xray_df = fetch_preprocessed_images_csv(image_path,
                                                    'processed_imgs')
            # todo: delete - just for testing
            # xray_df = xray_df[-50:]

        else:
            xray_df = load_xray(skip_processing, processed_labels_path,
                                classication_labels_path, image_path,
                                localization_labels_path, results_path)
        xray_df = ld.filter_observations(xray_df, class_name, 'No Finding')

    elif use_pascal:
        pascal_df = load_pascal(pascal_image_path)
    else:
        df_train_val, test_df_all_classes = load_mura(
            skip_processing, mura_processed_train_labels_path,
            mura_processed_test_labels_path, mura_train_img_path,
            mura_train_labels_path, mura_test_labels_path, mura_test_img_path)

    for split in range(0, number_splits):
        if use_xray:
            df_train, df_val, df_test, df_bbox_train, \
            df_bbox_test, train_only_class = split_xray_cv(xray_df, number_splits,
                                                           split, class_name)
        elif use_pascal:
            df_train, df_val, df_test = construct_train_test_cv(
                pascal_df, number_splits, split)
        else:
            df_train, df_val = split_data_cv(df_train_val,
                                             number_splits,
                                             split,
                                             random_seed=1,
                                             diagnose_col=class_name,
                                             ratio_to_keep=None)
            df_test = filter_rows_and_columns(test_df_all_classes, class_name)

        for curr_classifier in range(0, number_classifiers):
            if train_mode and split == CV_split_to_use:
                print("#####################################################")
                print("SPLIT :" + str(split))
                print("classifier #: " + str(curr_classifier))
                if use_xray:
                    class_train_subset = ld.get_train_subset_xray(
                        train_only_class,
                        df_bbox_train.shape[0],
                        random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)
                    print("new subset is :" + str(class_train_subset.shape))
                    df_train_subset = pd.concat(
                        [df_bbox_train, class_train_subset])
                    print(df_bbox_train.shape)
                    print(class_train_subset.shape)
                elif use_pascal:
                    df_train_subset = get_train_subset_mura(
                        df_train,
                        random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)
                else:
                    df_train_subset = get_train_subset_mura(
                        df_train,
                        random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)

                tf.keras.backend.clear_session()
                K.clear_session()

                ############################################ TRAIN ###########################################################
                train_generator = gen.BatchGenerator(
                    instances=df_train_subset.values,
                    resized_image=resized_images_before_training,
                    batch_size=BATCH_SIZE,
                    net_h=IMAGE_SIZE,
                    net_w=IMAGE_SIZE,
                    norm=keras_utils.normalize,
                    box_size=BOX_SIZE,
                    processed_y=skip_processing,
                    interpolation=mura_interpolation,
                    shuffle=True)

                valid_generator = gen.BatchGenerator(
                    instances=df_val.values,
                    resized_image=resized_images_before_training,
                    batch_size=BATCH_SIZE,
                    net_h=IMAGE_SIZE,
                    net_w=IMAGE_SIZE,
                    box_size=BOX_SIZE,
                    norm=keras_utils.normalize,
                    processed_y=skip_processing,
                    interpolation=mura_interpolation,
                    shuffle=True)

                model = keras_model.build_model(reg_weight)
                model = keras_model.compile_model_accuracy(
                    model, lr, pooling_operator)
                lrate = LearningRateScheduler(keras_model.step_decay,
                                              verbose=1)

                filepath = trained_models_path + "CV_" + str(
                    split) + '_' + str(
                        curr_classifier) + "_-{epoch:02d}-{val_loss:.2f}.hdf5"
                checkpoint_on_epoch_end = ModelCheckpoint(filepath,
                                                          monitor='val_loss',
                                                          verbose=1,
                                                          save_best_only=False,
                                                          mode='min')

                print("df train STEPS")
                print(len(df_train) // BATCH_SIZE)
                print(train_generator.__len__())

                history = model.fit_generator(
                    generator=train_generator,
                    steps_per_epoch=train_generator.__len__(),
                    epochs=nr_epochs,
                    validation_data=valid_generator,
                    validation_steps=valid_generator.__len__(),
                    verbose=1)
                filepath = trained_models_path + 'subset_' + class_name + "_CV" + str(split) + '_' + str(
                    curr_classifier) + '_' + \
                           str(overlap_ratio) + ".hdf5"
                model.save(filepath)
                print("history")
                print(history.history)
                print(history.history['keras_accuracy'])
                np.save(
                    trained_models_path + 'train_info_' + str(split) + '_' +
                    str(curr_classifier) + '_' + str(overlap_ratio) + '.npy',
                    history.history)

                settings = np.array({
                    'lr: ': lr,
                    'reg_weight: ': reg_weight,
                    'pooling_operator: ': pooling_operator
                })
                np.save(trained_models_path + 'train_settings.npy', settings)

                keras_utils.plot_train_validation(
                    history.history['loss'], history.history['val_loss'],
                    'train loss', 'validation loss',
                    'CV_loss' + str(split) + str(curr_classifier), 'loss',
                    trained_models_path)

                ############################################    PREDICTIONS      #############################################
                ########################################### TRAINING SET########################################################
                predict_patch_and_save_results(
                    model,
                    'train_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_train, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)

                ########################################## VALIDATION SET######################################################
                predict_patch_and_save_results(
                    model,
                    'val_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_val, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)

                ########################################### TESTING SET########################################################
                predict_patch_and_save_results(
                    model,
                    'test_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_test, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)
            elif not train_mode:
                files_found = 0
                print(trained_models_path)
                for file_path in Path(trained_models_path).glob(
                        "subset_Cardiomegaly_CV1_" + str(curr_classifier) +
                        "*.hdf5"):
                    print(file_path)
                    files_found += 1

                assert files_found == 1, "Either no model or multiple models were found; it is unclear which one to use"
                print(str(files_found))

                model = load_model(str(file_path),
                                   custom_objects={
                                       'keras_loss_v3_nor': keras_loss_v3_nor,
                                       'keras_accuracy': keras_accuracy,
                                       'accuracy_asloss': accuracy_asloss
                                   })

                model = keras_model.compile_model_accuracy(
                    model, lr, pooling_operator)

                predict_patch_and_save_results(
                    model, "train_set_CV" + str(split) + str(curr_classifier),
                    df_train, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                predict_patch_and_save_results(
                    model, "val_set_CV" + str(split) + str(curr_classifier),
                    df_val, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                predict_patch_and_save_results(
                    model, "test_set_CV" + str(split) + str(curr_classifier),
                    df_test, skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                    IMAGE_SIZE, prediction_results_path, mura_interpolation,
                    resized_images_before_training)
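

# An added usage sketch, assuming train_on_subsets is invoked analogously to cross_validation;
# the config file name, seed values and overlap ratio below are made up for illustration.
import yaml


def run_train_on_subsets_example():
    with open('config_subsets.yml') as f:
        config = yaml.safe_load(f)
    # train 5 classifiers on overlapping subsets of CV split 1, keeping 95% of the training set
    train_on_subsets(config,
                     number_splits=5,
                     CV_split_to_use=1,
                     number_classifiers=5,
                     subset_seeds=[0, 1, 2, 3, 4],
                     overlap_ratio=0.95)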


# Example 5
BATCH_SIZE_TEST = 1
BOX_SIZE = 16

use_xray, use_pascal = set_dataset_flag(dataset_name)
script_suffix = 'exploratory_exp'
trained_models_path = build_path_results(results_path,
                                         dataset_name,
                                         pooling_operator,
                                         script_suffix=script_suffix,
                                         result_suffix='trained_models')
prediction_results_path = build_path_results(results_path,
                                             dataset_name,
                                             pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='predictions')
make_directory(trained_models_path)
make_directory(prediction_results_path)

if use_xray:
    if resized_images_before_training:
        xray_df = fetch_preprocessed_images_csv(image_path, 'processed_imgs')
        #todo: delete after testing
        # xray_df = xray_df[-50:]
    else:
        xray_df = ldd.load_process_xray14(config)
    df_train, df_val, df_test = ldd.split_filter_data(config, xray_df)

elif use_pascal:
    df_train, df_val, df_test = ldd.load_preprocess_pascal(config)
else:
    df_train, df_val, df_test = ldd.load_preprocess_mura(config)