Example #1
0
def train_transfer_learning(model_param,
                            df_train,
                            df_val,
                            num_classes,
                            class_weight_dict,
                            model_folder,
                            history_folder,
                            parameters,
                            k_split=0):
    """Train a single transfer-learning classifier, then release its resources.

    Args:
        model_param: Base-model parameter object (must expose `class_name`).
        df_train: Training DataFrame with 'path' and 'category' columns.
        df_val: Validation DataFrame with 'path' and 'category' columns.
        num_classes: Number of output classes.
        class_weight_dict: Per-class weights applied during training.
        model_folder: Destination folder for saved model files.
        history_folder: Destination folder for training history.
        parameters: Hyperparameter object forwarded to the classifier.
        k_split: Cross-validation split index forwarded to `train`.
    """
    # Prepare image paths and one-hot labels up front so the constructor
    # call below stays readable.
    train_paths = df_train['path'].tolist()
    train_labels = utils.to_categorical(df_train['category'],
                                        num_classes=num_classes)
    val_paths = df_val['path'].tolist()
    val_labels = utils.to_categorical(df_val['category'],
                                      num_classes=num_classes)

    classifier = TransferLearnClassifier(
        model_folder=model_folder,
        history_folder=history_folder,
        base_model_param=model_param,
        fc_layers=[512],  # single fully-connected layer before the output
        num_classes=num_classes,
        image_data_format=K.image_data_format(),
        metrics=[balanced_accuracy(num_classes), 'accuracy'],
        class_weight=class_weight_dict,
        image_paths_train=train_paths,
        categories_train=train_labels,
        image_paths_val=val_paths,
        categories_val=val_labels,
        parameters=parameters)

    print("Begin to train {}".format(model_param.class_name))
    classifier.train(k_split=k_split, workers=os.cpu_count())

    # Drop the classifier and clear the Keras session so backend memory
    # does not accumulate across successive training runs.
    del classifier
    K.clear_session()
Example #2
0
def train_transfer_learning(base_model_params, df_train, df_val, num_classes, class_weight_dict, batch_size, max_queue_size, epoch_num, model_folder):
    """Train one transfer-learning classifier for each base-model configuration.

    Args:
        base_model_params: Iterable of base-model parameter objects (each must
            expose at least `class_name`).
        df_train: Training DataFrame with 'path' and 'category' columns.
        df_val: Validation DataFrame with 'path' and 'category' columns.
        num_classes: Number of output classes.
        class_weight_dict: Per-class weights applied during training.
        batch_size: Training batch size.
        max_queue_size: Maximum size of the data-generator queue.
        epoch_num: Number of epochs to train each model.
        model_folder: Destination folder for saved model files.
    """
    workers = os.cpu_count()

    # Hoist loop-invariant data preparation out of the per-model loop:
    # the image paths and one-hot labels are identical for every model.
    image_paths_train = df_train['path'].tolist()
    categories_train = np_utils.to_categorical(df_train['category'], num_classes=num_classes)
    image_paths_val = df_val['path'].tolist()
    categories_val = np_utils.to_categorical(df_val['category'], num_classes=num_classes)

    for model_param in base_model_params:
        classifier = TransferLearnClassifier(
            model_folder=model_folder,
            base_model_param=model_param,
            fc_layers=[512],  # e.g. [512]
            num_classes=num_classes,
            dropout=0.3,  # e.g. 0.3
            batch_size=batch_size,
            max_queue_size=max_queue_size,
            image_data_format=K.image_data_format(),
            metrics=[balanced_accuracy(num_classes), 'accuracy'],
            class_weight=class_weight_dict,
            image_paths_train=image_paths_train,
            categories_train=categories_train,
            image_paths_val=image_paths_val,
            categories_val=categories_val
        )
        classifier.model.summary()
        print("Begin to train {}".format(model_param.class_name))
        classifier.train(epoch_num=epoch_num, workers=workers)
        # Release the model and reset the Keras session before training the
        # next model so backend memory does not accumulate across iterations.
        del classifier
        K.clear_session()
Example #3
0
def compute_out_of_distribution_score(model_folder,
                                      df,
                                      num_classes,
                                      batch_size=32,
                                      temperature=2,
                                      magnitude=0.0002,
                                      delta=0.90385):
    """Score each image for being out-of-distribution via the ODIN method.

    Loads the saved DenseNet201 checkpoint, applies temperature-scaled
    predictions on input images perturbed against the prediction gradient
    (ODIN), then maps the resulting max-softmax score through a logistic
    to produce an out-of-distribution score in [0, 1].

    Args:
        model_folder: Folder containing 'DenseNet201_best_balanced_acc.hdf5'.
        df: DataFrame with 'image' and 'path' columns for images to score.
        num_classes: Number of in-distribution classes.
        batch_size: Number of images processed per batch.
        temperature: ODIN softmax temperature-scaling factor.
        magnitude: ODIN input-perturbation step size (epsilon).
        delta: Logistic midpoint for the softmax→OOD-score mapping --
            presumably tuned on validation data; TODO confirm.

    Returns:
        DataFrame with columns 'image', 'softmax_score', 'out_dist_score'.
    """
    model_filepath = os.path.join(model_folder,
                                  'DenseNet201_best_balanced_acc.hdf5')
    print('Loading model: ', model_filepath)
    model = load_model(
        filepath=model_filepath,
        custom_objects={'balanced_accuracy': balanced_accuracy(num_classes)})
    image_data_format = K.image_data_format()
    model_param_map = get_transfer_model_param_map()
    # Non-shuffling iterator so score order matches the row order of `df`.
    generator = ImageIterator(
        image_paths=df['path'].tolist(),
        labels=None,
        augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
            model_param_map['DenseNet201'].input_size),
        preprocessing_function=model_param_map['DenseNet201'].
        preprocessing_func,
        batch_size=batch_size,
        shuffle=False,
        rescale=None,
        pregen_augmented_images=False,
        data_format=image_data_format)

    # Backend functions: one computes input perturbations, the other the
    # temperature-scaled dense-layer output.
    compute_perturbations, get_scaled_dense_pred_output = get_perturbation_helper_func(
        model, temperature, num_classes)

    df_score = df[['image']].copy()
    softmax_scores = []
    learning_phase = 0  # 0 = test, 1 = train
    steps = math.ceil(df.shape[0] / batch_size)
    for _ in trange(steps):
        images = next(generator)
        perturbations = compute_perturbations([images, learning_phase])[0]
        # Get sign of perturbations
        perturbations = np.sign(perturbations)
        # DenseNet201 need normalization
        perturbations = norm_perturbations(perturbations, image_data_format)
        # Add perturbations to images
        perturbative_images = images - magnitude * perturbations
        # Calculate the confidence after adding perturbations
        dense_pred_outputs = get_scaled_dense_pred_output(
            [perturbative_images, learning_phase])[0]
        softmax_probs = softmax(dense_pred_outputs)
        # Keep only the maximum class probability per image.
        softmax_scores.extend(np.max(softmax_probs, axis=-1).tolist())

    # Free the model and reset the Keras session state.
    del model
    K.clear_session()

    df_score['softmax_score'] = softmax_scores
    # Low softmax confidence => high out-of-distribution score.
    df_score['out_dist_score'] = 1 - logistic(
        x=df_score['softmax_score'], x0=delta, k=20)
    return df_score
Example #4
0
def train_vanilla(df_train, df_val, num_classes, class_weight_dict, batch_size, max_queue_size, epoch_num, input_size, model_folder):
    """Build, summarize and train the vanilla CNN classifier, then free the session.

    Args:
        df_train: Training DataFrame with 'path' and 'category' columns.
        df_val: Validation DataFrame with 'path' and 'category' columns.
        num_classes: Number of output classes.
        class_weight_dict: Per-class weights applied during training.
        batch_size: Training batch size.
        max_queue_size: Maximum size of the data-generator queue.
        epoch_num: Number of epochs to train.
        input_size: Input image size, e.g. (224, 224).
        model_folder: Destination folder for saved model files.
    """
    # Prepare image paths and one-hot labels up front for readability.
    paths_train = df_train['path'].tolist()
    labels_train = np_utils.to_categorical(df_train['category'], num_classes=num_classes)
    paths_val = df_val['path'].tolist()
    labels_val = np_utils.to_categorical(df_val['category'], num_classes=num_classes)

    classifier = VanillaClassifier(
        model_folder=model_folder,
        input_size=input_size,
        image_data_format=K.image_data_format(),
        num_classes=num_classes,
        batch_size=batch_size,
        max_queue_size=max_queue_size,
        class_weight=class_weight_dict,
        metrics=[balanced_accuracy(num_classes), 'accuracy'],
        image_paths_train=paths_train,
        categories_train=labels_train,
        image_paths_val=paths_val,
        categories_val=labels_val
    )
    classifier.model.summary()
    print('Begin to train Vanilla CNN')
    classifier.train(epoch_num=epoch_num, workers=os.cpu_count())

    # Drop the classifier and clear the Keras session to release memory.
    del classifier
    K.clear_session()
Example #5
0
def predidct_test(model_folder,
                  test_image_folder,
                  pred_result_folder_test,
                  models_to_predict,
                  category_names,
                  unknown_method,
                  unknown_category,
                  postfix,
                  parameters,
                  k_folds=0):
    """Predict the test set with each trained model, optionally ensembling k folds.

    NOTE(review): the function name contains a typo ('predidct'); kept
    unchanged because external callers may reference it.

    Args:
        model_folder: Root folder containing trained model checkpoints.
        test_image_folder: Folder with the test images.
        pred_result_folder_test: Output folder for prediction CSVs.
        models_to_predict: Model parameter objects (must expose `class_name`,
            `input_size`, `preprocessing_func`).
        category_names: Class names used for the softmax output columns.
        unknown_method: Strategy forwarded to `handle_unknown`.
        unknown_category: Appears unused in this function -- presumably kept
            for interface compatibility; TODO confirm.
        postfix: Checkpoint postfix (e.g. 'best_balanced_acc').
        parameters: Hyperparameter object; its formatted string is part of
            the model file path.
        k_folds: Number of folds; 0 means a single non-folded model.

    Returns:
        None. All results are written under `pred_result_folder_test`.
    """
    os.makedirs(pred_result_folder_test, exist_ok=True)
    df_test = get_dataframe_from_img_folder(test_image_folder,
                                            has_path_col=True)
    # Persist the test-image listing (paths excluded) alongside predictions.
    df_test.drop(columns=['path']).to_csv(os.path.join(pred_result_folder_test,
                                                       'ISIC_2019_Test.csv'),
                                          index=False)

    hyperparameter_str = formated_hyperparameters(parameters)
    print(hyperparameter_str)

    # With k_folds == 0, still run the loop body exactly once (fold "0").
    model_kfold_folders = range(k_folds)
    if k_folds == 0:
        model_kfold_folders = [0]

    for model_to_predict in models_to_predict:
        for k_fold in model_kfold_folders:
            # Try the non-folded model path first ...
            model_filepath = os.path.join(model_folder,
                                          model_to_predict.class_name,
                                          hyperparameter_str,
                                          "{}.hdf5".format(postfix))

            # ... then fall back to the per-fold subfolder.
            if not os.path.exists(model_filepath):
                model_filepath = os.path.join(model_folder,
                                              model_to_predict.class_name,
                                              hyperparameter_str, str(k_fold),
                                              "{}.hdf5".format(postfix))

            if os.path.exists(model_filepath):
                print(
                    "===== Predict test data using \"{}_{}\" with \"{}\" model ====="
                    .format(model_to_predict.class_name, k_fold, postfix))

                model = load_model(filepath=model_filepath,
                                   custom_objects={
                                       'balanced_accuracy':
                                       balanced_accuracy(len(category_names))
                                   })

                df_softmax = LesionClassifier.predict_dataframe(
                    model=model,
                    df=df_test,
                    category_names=category_names,
                    augmentation_pipeline=LesionClassifier.create_aug_pipeline(
                        0, model_to_predict.input_size, True),
                    preprocessing_function=model_to_predict.preprocessing_func,
                    batch_size=parameters.batch_size,
                    workers=os.cpu_count(),
                )

                # Post-process predictions for the unknown/UNK category.
                df_softmax = handle_unknown(
                    model,
                    model_to_predict,
                    parameters,
                    df_softmax,
                    df_test,
                    unknown_method,
                    prediction_label=str(k_fold),
                    pred_result_folder_test=pred_result_folder_test)

                #del model  NOTE(review): deletion left commented out in the
                # original; relying on K.clear_session() below instead.
                K.clear_session()
            else:
                # Abort entirely if any expected model file is missing.
                print("\"{}\" doesn't exist".format(model_filepath))
                return

        if k_folds > 0:
            print(
                "===== Ensembling predictions from {} k folds of {}\" model ====="
                .format(k_folds, model_to_predict.class_name))
            # Ensemble Models' k fold predictions
            df_softmax = ensemble_predictions_k_fold(
                result_folder=pred_result_folder_test,
                parameters=parameters,
                category_names=category_names,
                model_name=model_to_predict.class_name,
                postfix=postfix,
                k_folds=k_folds)

        # Save ensemble predictions (for k_folds == 0 this saves the last
        # per-fold df_softmax from the loop above).
        save_prediction_results(
            df_softmax,
            model_to_predict.class_name,
            pred_result_folder_test=pred_result_folder_test,
            parameters=parameters,
            postfix=postfix)

    if (len(models_to_predict) > 1):
        model_names = [model.class_name for model in models_to_predict]

        print(
            "===== Ensembling predictions using from {} models using {} =====".
            format(model_names, postfix))

        # Ensemble Models' Predictions on Test Data
        df_ensemble = ensemble_predictions(
            result_folder=pred_result_folder_test,
            parameters=parameters,
            category_names=category_names,
            model_names=model_names,
            postfix=postfix)

        # Save ensemble predictions
        save_prediction_results(
            df_ensemble,
            "-".join([
                f"{model_name}_{k_folds}" for model_name in sorted(model_names)
            ]),
            pred_result_folder_test=pred_result_folder_test)
Example #6
0
def compute_odin_softmax_scores(in_dist_pred_result_folder,
                                in_dist_image_folder,
                                out_dist_pred_result_folder,
                                out_dist_image_folder, model_folder,
                                softmax_score_folder, num_classes, batch_size):
    """Calculate softmax scores for different combinations of ODIN parameters.

    For each (model, temperature, magnitude, distribution) combination,
    perturbs the input images (ODIN), runs temperature-scaled predictions,
    and writes the per-image max-softmax scores to a text file. Completed
    combinations are recorded in a progress file so interrupted runs can
    resume without recomputation.

    Args:
        in_dist_pred_result_folder: Folder of in-distribution prediction CSVs.
        in_dist_image_folder: Folder of in-distribution images (.jpg).
        out_dist_pred_result_folder: Folder of out-of-distribution CSVs.
        out_dist_image_folder: Folder of out-of-distribution images (.jpg).
        model_folder: Folder containing '<model>_<postfix>.hdf5' checkpoints.
        softmax_score_folder: Output root for score files and 'Done.txt'.
        num_classes: Number of in-distribution classes.
        batch_size: Number of images processed per batch.
    """
    print('Begin to compute ODIN softmax scores')
    model_names = ['DenseNet201', 'Xception', 'ResNeXt50']
    # postfixes = ['best_balanced_acc', 'best_loss', 'latest']
    postfixes = ['best_balanced_acc']
    distributions = ['In', 'Out']

    # This file is used for recording what parameter combinations were already computed.
    progress_file = os.path.join(softmax_score_folder, 'Done.txt')
    done_set = set()
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            done_set = set(line.rstrip('\n') for line in f)

    # ODIN parameters
    temperatures = [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1]
    magnitudes = np.round(np.arange(0, 0.0041, 0.0002), 4)

    model_param_map = get_transfer_model_param_map()
    image_data_format = K.image_data_format()
    learning_phase = 0  # 0 = test, 1 = train

    for modelattr in (ModelAttr(x, y) for x in model_names for y in postfixes):
        # In-distribution data
        df = {}
        df['In'] = pd.read_csv(
            os.path.join(
                in_dist_pred_result_folder,
                "{}_{}.csv".format(modelattr.model_name, modelattr.postfix)))
        # Derive each image's absolute path from its id in the CSV.
        df['In']['path'] = df['In'].apply(lambda row: os.path.join(
            in_dist_image_folder, row['image'] + '.jpg'),
                                          axis=1)
        # Non-shuffling iterator so scores line up with CSV row order; one
        # iterator per model is reused across all parameter combinations.
        generator_in = ImageIterator(
            image_paths=df['In']['path'].tolist(),
            labels=None,
            augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
                model_param_map[modelattr.model_name].input_size),
            preprocessing_function=model_param_map[
                modelattr.model_name].preprocessing_func,
            batch_size=batch_size,
            shuffle=False,
            rescale=None,
            pregen_augmented_images=True,
            data_format=image_data_format)

        # Out-distribution data
        df['Out'] = pd.read_csv(
            os.path.join(
                out_dist_pred_result_folder,
                "{}_{}.csv".format(modelattr.model_name, modelattr.postfix)))
        df['Out']['path'] = df['Out'].apply(lambda row: os.path.join(
            out_dist_image_folder, row['image'] + '.jpg'),
                                            axis=1)
        generator_out = ImageIterator(
            image_paths=df['Out']['path'].tolist(),
            labels=None,
            augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
                model_param_map[modelattr.model_name].input_size),
            preprocessing_function=model_param_map[
                modelattr.model_name].preprocessing_func,
            batch_size=batch_size,
            shuffle=False,
            rescale=None,
            pregen_augmented_images=True,
            data_format=image_data_format)

        # Load model
        model_filepath = os.path.join(
            model_folder, "{}_{}.hdf5".format(modelattr.model_name,
                                              modelattr.postfix))
        print('Loading model: ', model_filepath)
        model = load_model(filepath=model_filepath,
                           custom_objects={
                               'balanced_accuracy':
                               balanced_accuracy(num_classes)
                           })
        # Only these architectures need perturbations scaled to image space.
        need_norm_perturbations = (modelattr.model_name == 'DenseNet201'
                                   or modelattr.model_name == 'ResNeXt50')

        for temperature in temperatures:
            compute_perturbations, get_scaled_dense_pred_output = get_perturbation_helper_func(
                model, temperature, num_classes)

            for magnitude in magnitudes:
                for dist in distributions:
                    # Skip if the parameter combination has done
                    param_comb_id = "{}_{}, {}, {}, {}".format(
                        modelattr.model_name, modelattr.postfix, dist,
                        temperature, magnitude)
                    if param_comb_id in done_set:
                        print('Skip ', param_comb_id)
                        continue

                    generator = generator_in if dist == 'In' else generator_out

                    print(
                        "\n===== Temperature: {}, Magnitude: {}, {}-Distribution ====="
                        .format(temperature, magnitude, dist))
                    softmax_score_sub_folder = os.path.join(
                        softmax_score_folder,
                        "{}_{}".format(temperature, magnitude))
                    os.makedirs(softmax_score_sub_folder, exist_ok=True)

                    steps = math.ceil(df[dist].shape[0] / batch_size)
                    # Rewind the shared iterator so every combination scores
                    # the full dataset from the beginning.
                    generator.reset()
                    f = open(
                        os.path.join(
                            softmax_score_sub_folder,
                            "{}_{}_ODIN_{}.txt".format(modelattr.model_name,
                                                       modelattr.postfix,
                                                       dist)), 'w')
                    for _ in trange(steps):
                        images = next(generator)
                        perturbations = compute_perturbations(
                            [images, learning_phase])[0]
                        # Get sign of perturbations
                        perturbations = np.sign(perturbations)

                        # Normalize the perturbations to the same space of image
                        # https://github.com/facebookresearch/odin/issues/5
                        # Perturbations divided by ISIC Training Set STD
                        if need_norm_perturbations:
                            perturbations = norm_perturbations(
                                perturbations, image_data_format)

                        # Add perturbations to images
                        perturbative_images = images - magnitude * perturbations

                        # Calculate the confidence after adding perturbations
                        dense_pred_outputs = get_scaled_dense_pred_output(
                            [perturbative_images, learning_phase])[0]
                        softmax_probs = softmax(dense_pred_outputs)
                        softmax_scores = np.max(softmax_probs, axis=-1)
                        # One score per line, in dataset order.
                        for s in softmax_scores:
                            f.write("{}\n".format(s))
                    f.close()

                    # Mark this combination as done so a rerun can skip it.
                    with open(progress_file, 'a') as f_done:
                        f_done.write("{}\n".format(param_comb_id))
        # Free the model before moving on to the next architecture.
        del model
        K.clear_session()
Example #7
0
def main():
    """Command-line driver for the ISIC-2019 skin-lesion classifiers.

    Parses CLI arguments, optionally trains the requested models, predicts
    the validation and/or test sets, ensembles the predictions, and (for
    approach 1) computes out-of-distribution / ODIN softmax scores.
    """
    parser = argparse.ArgumentParser(description='ISIC-2019 Skin Lesion Classifiers')
    parser.add_argument('data', metavar='DIR', help='path to data folder')
    parser.add_argument('--batchsize', type=int, help='Batch size (default: %(default)s)', default=32)
    parser.add_argument('--maxqueuesize', type=int, help='Maximum size for the generator queue (default: %(default)s)', default=10)
    parser.add_argument('--epoch', type=int, help='Number of epochs (default: %(default)s)', default=100)
    parser.add_argument('--model', dest='models', nargs='*', choices=['Vanilla', 'DenseNet201', 'Xception', 'ResNeXt50', 'NASNetLarge', 'InceptionResNetV2'], help='Models')
    parser.add_argument('--autoshutdown', dest='autoshutdown', action='store_true', help='Automatically shutdown the computer after everything is done.')
    parser.add_argument('--training', dest='training', action='store_true', help='Train models')
    parser.add_argument('--predval', dest='predval', action='store_true', help='Predict validation set')
    parser.add_argument('--predtest', dest='predtest', action='store_true', help='Predict the test data which contains 8238 JPEG images of skin lesions.')
    parser.add_argument('--predvalresultfolder', help='Name of the prediction result folder for validation set (default: %(default)s)', default='val_predict_results')
    parser.add_argument('--predtestresultfolder', help='Name of the prediction result folder for test data (default: %(default)s)', default='test_predict_results')
    parser.add_argument('--modelfolder', help='Name of the model folder (default: %(default)s)', default='models')
    parser.add_argument('--odinscore', dest='odinscore', action='store_true', help='Only relevant if approach is 1. Computing Baseline and ODIN softmax scores.')
    parser.add_argument('--approach', type=int, choices=range(1, 3), required=True, help='Approach for training the models')
    args = parser.parse_args()
    print(args)

    # Write command to a file (simple reproducibility log).
    with open('Cmd_History.txt', 'a') as f:
        f.write("{}\t{}\n".format(str(datetime.datetime.utcnow()), str(args)))

    data_folder = args.data
    pred_result_folder_val = args.predvalresultfolder
    pred_result_folder_test = args.predtestresultfolder
    model_folder = args.modelfolder
    softmax_score_folder = 'softmax_scores'
    batch_size = args.batchsize
    max_queue_size = args.maxqueuesize
    epoch_num = args.epoch
    approach = args.approach

    # ISIC data
    training_image_folder = os.path.join(data_folder, 'ISIC_2019_Training_Input')
    test_image_folder = os.path.join(data_folder, 'ISIC_2019_Test_Input')

    # Out-of-distribution data
    out_dist_image_folder = os.path.join(data_folder, 'Out_Distribution')
    out_dist_pred_result_folder = 'out_dist_predict_results'

    # Ground truth of different approaches
    if approach == 1:
        ground_truth_file = os.path.join(data_folder, 'ISIC_2019_Training_GroundTruth.csv')
        df_ground_truth, known_category_names, unknown_category_name = load_isic_training_data(training_image_folder, ground_truth_file)
        category_names = known_category_names
    elif approach == 2:
        ground_truth_file = os.path.join(data_folder, 'ISIC_2019_Training_GroundTruth_DuplicateRemoved.csv')
        df_ground_truth, category_names = load_isic_training_and_out_dist_data(training_image_folder, ground_truth_file, out_dist_image_folder)
    else:
        # Unreachable via argparse (choices=range(1, 3)); kept defensively.
        # BUGFIX: corrected the typo 'appraoch' in the error message.
        print('Unknown approach:', approach)
        return
    df_train, df_val = train_validation_split(df_ground_truth)
    class_weight_dict, _ = compute_class_weight_dict(df_train)
    category_num = len(category_names)

    # Models used for prediction
    models_to_predict = []
    workers = os.cpu_count()

    # Train Vanilla CNN
    if args.models is not None and 'Vanilla' in args.models:
        input_size_vanilla = (224, 224)
        if args.training:
            train_vanilla(df_train, df_val, category_num, class_weight_dict, batch_size, max_queue_size, epoch_num, input_size_vanilla, model_folder)
        models_to_predict.append({'model_name': 'Vanilla',
                                  'input_size': input_size_vanilla,
                                  'preprocessing_function': VanillaClassifier.preprocess_input})

    # BUGFIX: the original called args.models.copy() unconditionally, which
    # raised AttributeError when --model was omitted (args.models is None).
    # Compute the transfer-learning model list None-safely instead.
    transfer_models = [] if args.models is None else [m for m in args.models if m != 'Vanilla']

    # Train models by Transfer Learning
    if args.models is not None:
        model_param_map = get_transfer_model_param_map() if approach == 1 else get_transfer_model_param_map_2()
        base_model_params = [model_param_map[x] for x in transfer_models]
        if args.training:
            train_transfer_learning(base_model_params, df_train, df_val, category_num, class_weight_dict, batch_size, max_queue_size, epoch_num, model_folder)
        for base_model_param in base_model_params:
            models_to_predict.append({'model_name': base_model_param.class_name,
                                      'input_size': base_model_param.input_size,
                                      'preprocessing_function': base_model_param.preprocessing_func})

    # Predict validation set
    if args.predval:
        os.makedirs(pred_result_folder_val, exist_ok=True)
        # Save Ground Truth of validation set
        val_ground_truth_file_path = os.path.join(pred_result_folder_val, 'Validation_Set_GroundTruth.csv')
        df_val.drop(columns=['path', 'category']).to_csv(path_or_buf=val_ground_truth_file_path, index=False)
        print("Save \"{}\"".format(val_ground_truth_file_path))

        postfixes = ['best_balanced_acc', 'best_loss', 'latest']
        for postfix in postfixes:
            for m in models_to_predict:
                model_filepath = os.path.join(model_folder, "{}_{}.hdf5".format(m['model_name'], postfix))
                if os.path.exists(model_filepath):
                    print("===== Predict validation set using \"{}_{}\" model =====".format(m['model_name'], postfix))
                    model = load_model(filepath=model_filepath, custom_objects={'balanced_accuracy': balanced_accuracy(category_num)})
                    LesionClassifier.predict_dataframe(model=model, df=df_val,
                                                       category_names=category_names,
                                                       augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(m['input_size']),
                                                       preprocessing_function=m['preprocessing_function'],
                                                       batch_size=batch_size,
                                                       workers=workers,
                                                       softmax_save_file_name=os.path.join(pred_result_folder_val, "{}_{}.csv").format(m['model_name'], postfix),
                                                       logit_save_file_name=os.path.join(pred_result_folder_val, "{}_{}_logit.csv").format(m['model_name'], postfix))
                    # Release the model between checkpoints to limit memory use.
                    del model
                    K.clear_session()
                else:
                    print("\"{}\" doesn't exist".format(model_filepath))

    # Predict Test Data
    if args.predtest:
        os.makedirs(pred_result_folder_test, exist_ok=True)
        df_test = get_dataframe_from_img_folder(test_image_folder, has_path_col=True)
        df_test.drop(columns=['path']).to_csv(os.path.join(pred_result_folder_test, 'ISIC_2019_Test.csv'), index=False)
        postfix = 'best_balanced_acc'
        for m in models_to_predict:
            model_filepath = os.path.join(model_folder, "{}_{}.hdf5".format(m['model_name'], postfix))
            if os.path.exists(model_filepath):
                print("===== Predict test data using \"{}_{}\" model =====".format(m['model_name'], postfix))
                model = load_model(filepath=model_filepath, custom_objects={'balanced_accuracy': balanced_accuracy(category_num)})
                LesionClassifier.predict_dataframe(model=model, df=df_test,
                                                   category_names=category_names,
                                                   augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(m['input_size']),
                                                   preprocessing_function=m['preprocessing_function'],
                                                   batch_size=batch_size,
                                                   workers=workers,
                                                   softmax_save_file_name=os.path.join(pred_result_folder_test, "{}_{}.csv").format(m['model_name'], postfix),
                                                   logit_save_file_name=os.path.join(pred_result_folder_test, "{}_{}_logit.csv").format(m['model_name'], postfix))
                del model
                K.clear_session()
            else:
                print("\"{}\" doesn't exist".format(model_filepath))

        # Ensemble Models' Predictions on Test Data
        df_ensemble = ensemble_predictions(result_folder=pred_result_folder_test, category_names=category_names, save_file=False,
                                           model_names=transfer_models, postfixes=[postfix]).drop(columns=['pred_category'])
        if approach == 1:
            # Compute Out-of-Distribution scores
            df_score = compute_out_of_distribution_score(model_folder=model_folder, df=df_test, num_classes=category_num, batch_size=batch_size)
            # Merge ensemble predictions with out-of-Distribution scores
            df_ensemble[unknown_category_name] = df_score['out_dist_score']
        df_ensemble.to_csv(os.path.join(pred_result_folder_test, "Ensemble_{}.csv".format(postfix)), index=False)

    # Compute Baseline and ODIN Softmax Scores
    if args.odinscore and approach == 1:
        os.makedirs(softmax_score_folder, exist_ok=True)
        compute_baseline_softmax_scores(in_dist_pred_result_folder=pred_result_folder_val,
                                        out_dist_pred_result_folder=out_dist_pred_result_folder,
                                        softmax_score_folder=softmax_score_folder)

        compute_odin_softmax_scores(in_dist_pred_result_folder=pred_result_folder_val, in_dist_image_folder=training_image_folder,
                                    out_dist_pred_result_folder=out_dist_pred_result_folder, out_dist_image_folder=out_dist_image_folder,
                                    model_folder=model_folder, softmax_score_folder=softmax_score_folder,
                                    num_classes=category_num, batch_size=batch_size)

    # Shutdown
    if args.autoshutdown:
        # Shutdown after 2 minutes
        os.system("sudo shutdown -h +2")
def evaluate_model(dataset,
                   save_file,
                   random_state,
                   clf,
                   clf_name,
                   hyper_params,
                   longitudinal=False,
                   rare=True):
    """Evaluate a classifier on a dataset with age/sex-matched controls.

    Pipeline: read the data, draw controls matched on age (quartile) and
    sex (exact), split the matched sample 50/50 into train/test, tune
    ``clf`` over ``hyper_params`` with stratified 10-fold CV on the
    training half (model selection by mean ROC AUC), refit the best
    configuration on all training data, then score it on the held-out
    test half and append metrics, feature importances and the ROC curve
    to ``save_file``.

    Parameters
    ----------
    dataset : str
        Path to the data file passed to ``read_file``.
    save_file : str
        Results file; one tab-separated line is appended per call.
    random_state : int
        Seed shared by the sampler, the CV splitter and the train/test
        split.
    clf : estimator
        Classifier following the scikit-learn fit/predict API.
    clf_name : str
        Display name; names containing 'Feat' trigger Feat-specific
        wiring (byte-encoded feature names, per-fold logfiles).
    hyper_params : dict
        Parameter grid for tuning; an empty dict skips tuning and uses
        ``clf`` as-is.
    longitudinal : bool
        If True, fit/predict calls additionally receive ``zfile`` and
        the matching patient ids.
    rare : bool
        Forwarded to ``read_file``.

    Returns
    -------
    estimator
        The best classifier, refit on the full training half.
    """
    print('reading data...', end='')
    features, labels, pt_ids, feature_names, zfile = read_file(
        dataset, longitudinal, rare)
    print('done.', len(labels), 'samples,', np.sum(labels == 1), 'cases,',
          features.shape[1], 'features')
    if 'Feat' in clf_name:
        # Feat expects a comma-separated byte string of feature names.
        clf.feature_names = ','.join(feature_names).encode()

    n_splits = 10
    cv = StratifiedKFold(n_splits=n_splits,
                         shuffle=True,
                         random_state=random_state)

    ###
    # controls matching on age and sex
    ###
    # NOTE(review): np.argmax returns 0 when no element matches, so a
    # missing 'age'/'SEX' column would silently match on column 0 —
    # assumes both names are always present in feature_names.
    idx_age = np.argmax(feature_names == 'age')
    idx_sex = np.argmax(feature_names == 'SEX')

    sampler = QuartileExactMatch(quart_locs=[idx_age],
                                 exact_locs=[idx_sex],
                                 random_state=random_state)

    print('sampling data...', end='')
    # sidx maps rows of the matched sample back to rows of `features`
    # (and hence to entries of pt_ids).
    X, y, sidx = sampler.fit_sample(features, labels)
    print('sampled data contains', np.sum(y == 1), 'cases', np.sum(y == 0),
          'controls')

    ###
    # split into train/test
    ###
    X_train, X_test, y_train, y_test, sidx_train, sidx_test = (
        train_test_split(X,
                         y,
                         sidx,
                         train_size=0.5,
                         test_size=0.5,
                         random_state=random_state))

    if len(hyper_params) > 0:
        param_grid = list(ParameterGrid(hyper_params))
        # one cloned estimator per grid point
        Clfs = [clone(clf).set_params(**p) for p in param_grid]
        # FIX: size the score matrix by n_splits rather than a
        # hard-coded 10 so the two stay in sync.
        cv_scores = np.zeros((len(param_grid), n_splits))
        cv_probs = np.zeros(
            (len(param_grid), len(y_train)))  # cross validated probabilities

        ###########
        # manual n_splits-fold cross validation with hyperparameter tuning
        t0 = time.process_time()
        for j, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            print('fold', j)

            for i, est in enumerate(Clfs):
                print('training', type(est).__name__, i + 1, 'of', len(Clfs))
                if 'Feat' in clf_name:
                    # give every (param, fold) run its own logfile
                    est.logfile = (est.logfile.decode().split('.log')[0] +
                                   '.log.param' + str(i) + '.cv' +
                                   str(j)).encode()
                ##########
                # fit model
                ##########
                if longitudinal:
                    est.fit(X_train[train_idx], y_train[train_idx], zfile,
                            pt_ids[sidx_train[train_idx]])
                else:
                    est.fit(X_train[train_idx], y_train[train_idx])

                ##########
                # get predictions
                ##########
                print('getting validation predictions...')
                if longitudinal:
                    # FIX: pass the *validation*-fold patient ids; the
                    # original passed pt_ids[sidx_train[train_idx]],
                    # whose length does not match X_train[val_idx].
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[val_idx]])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[val_idx]])
                else:
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx])

                ##########
                # scores
                ##########
                cv_scores[i, j] = roc_auc_score(y_train[val_idx],
                                                cv_probs[i, val_idx])

        runtime = time.process_time() - t0
        ###########

        print('gridsearch finished in', runtime, 'seconds')

        ##########
        # get best model and its information
        mean_cv_scores = [np.mean(s) for s in cv_scores]
        best_clf = Clfs[np.argmax(mean_cv_scores)]
        ##########
    else:
        print('skipping hyperparameter tuning')
        best_clf = clf  # this option is for skipping model tuning
        t0 = time.process_time()

    print('fitting tuned model to all training data...')
    if longitudinal:
        best_clf.fit(X_train, y_train, zfile, pt_ids[sidx_train])
    else:
        best_clf.fit(X_train, y_train)

    if len(hyper_params) == 0:
        # without tuning, runtime covers only the final fit
        runtime = time.process_time() - t0

    if not longitudinal:
        # internal (model-reported) feature importances on training data
        cv_FI_int = compute_imp_score(best_clf,
                                      clf_name,
                                      X_train,
                                      y_train,
                                      random_state,
                                      perm=False)
        # permutation importances on the held-out test data
        FI = compute_imp_score(best_clf,
                               clf_name,
                               X_test,
                               y_test,
                               random_state,
                               perm=True)

    ##########
    # metrics: test the best classifier on the held-out test set
    print('getting test predictions...')
    if longitudinal:

        print('best_clf.predict(X_test, zfile, pt_ids[sidx_test])')
        test_predictions = best_clf.predict(X_test, zfile, pt_ids[sidx_test])
        if getattr(clf, "predict_proba", None):
            print('best_clf.predict_proba(X_test, zfile, pt_ids[sidx_test])')
            test_probabilities = best_clf.predict_proba(
                X_test, zfile, pt_ids[sidx_test])[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(
                X_test, zfile, pt_ids[sidx_test])
    else:
        test_predictions = best_clf.predict(X_test)
        if getattr(clf, "predict_proba", None):
            test_probabilities = best_clf.predict_proba(X_test)[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(X_test)

    accuracy = accuracy_score(y_test, test_predictions)
    macro_f1 = f1_score(y_test, test_predictions, average='macro')
    bal_acc = balanced_accuracy(y_test, test_predictions)
    roc_auc = roc_auc_score(y_test, test_probabilities)

    ##########
    # save results to file
    print('saving results...')
    # flatten params to a single token (no newlines/spaces) so the
    # output stays one tab-separated line
    param_string = ','.join([
        '{}={}'.format(p, v) for p, v in best_clf.get_params().items()
        if p != 'feature_names'
    ]).replace('\n', '').replace(' ', '')

    out_text = '\t'.join([
        dataset.split('/')[-1], clf_name, param_string,
        str(random_state),
        str(accuracy),
        str(macro_f1),
        str(bal_acc),
        str(roc_auc),
        str(runtime)
    ])
    print(out_text)
    with open(save_file, 'a') as out:
        out.write(out_text + '\n')
    sys.stdout.flush()

    print('saving feature importance')
    # write feature importances (not computed for longitudinal models)
    if not longitudinal:
        feature_importance(save_file,
                           best_clf,
                           feature_names,
                           X_test,
                           y_test,
                           random_state,
                           clf_name,
                           param_string,
                           cv_FI_int,
                           perm=False)
        feature_importance(save_file,
                           best_clf,
                           feature_names,
                           X_test,
                           y_test,
                           random_state,
                           clf_name,
                           param_string,
                           FI,
                           perm=True)
    # write roc curves
    print('saving roc')
    roc(save_file, y_test, test_probabilities, random_state, clf_name,
        param_string)

    return best_clf