def train_transfer_learning(model_param, df_train, df_val, num_classes,
                            class_weight_dict, model_folder, history_folder,
                            parameters, k_split=0):
    # Create classifier
    classifier = TransferLearnClassifier(
        model_folder=model_folder,
        history_folder=history_folder,
        base_model_param=model_param,
        fc_layers=[512],  # e.g. [512]
        num_classes=num_classes,
        image_data_format=K.image_data_format(),
        metrics=[balanced_accuracy(num_classes), 'accuracy'],
        class_weight=class_weight_dict,
        image_paths_train=df_train['path'].tolist(),
        categories_train=utils.to_categorical(df_train['category'],
                                              num_classes=num_classes),
        image_paths_val=df_val['path'].tolist(),
        categories_val=utils.to_categorical(df_val['category'],
                                            num_classes=num_classes),
        parameters=parameters)
    print("Begin to train {}".format(model_param.class_name))
    classifier.train(k_split=k_split, workers=os.cpu_count())
    del classifier
    K.clear_session()
def train_transfer_learning(base_model_params, df_train, df_val, num_classes,
                            class_weight_dict, batch_size, max_queue_size,
                            epoch_num, model_folder):
    workers = os.cpu_count()
    for model_param in base_model_params:
        classifier = TransferLearnClassifier(
            model_folder=model_folder,
            base_model_param=model_param,
            fc_layers=[512],  # e.g. [512]
            num_classes=num_classes,
            dropout=0.3,  # e.g. 0.3
            batch_size=batch_size,
            max_queue_size=max_queue_size,
            image_data_format=K.image_data_format(),
            metrics=[balanced_accuracy(num_classes), 'accuracy'],
            class_weight=class_weight_dict,
            image_paths_train=df_train['path'].tolist(),
            categories_train=np_utils.to_categorical(df_train['category'],
                                                     num_classes=num_classes),
            image_paths_val=df_val['path'].tolist(),
            categories_val=np_utils.to_categorical(df_val['category'],
                                                   num_classes=num_classes))
        classifier.model.summary()
        print("Begin to train {}".format(model_param.class_name))
        classifier.train(epoch_num=epoch_num, workers=workers)
        del classifier
        K.clear_session()
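# `TransferLearnClassifier`, `balanced_accuracy`, and the model parameter maps
# are defined elsewhere in the repo. A minimal sketch of what
# `get_transfer_model_param_map` plausibly returns, assuming a namedtuple with
# the three fields the surrounding code accesses (class_name, input_size,
# preprocessing_func); the entries shown are illustrative and incomplete, and
# the input sizes simply follow the Keras defaults:
from collections import namedtuple

from keras.applications.densenet import preprocess_input as preprocess_densenet
from keras.applications.xception import preprocess_input as preprocess_xception

ModelParam = namedtuple('ModelParam',
                        ['class_name', 'input_size', 'preprocessing_func'])


def get_transfer_model_param_map():
    """Hypothetical reconstruction, not the repo's actual map."""
    return {
        'DenseNet201': ModelParam('DenseNet201', (224, 224), preprocess_densenet),
        'Xception': ModelParam('Xception', (299, 299), preprocess_xception),
    }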
def compute_out_of_distribution_score(model_folder, df, num_classes,
                                      batch_size=32, temperature=2,
                                      magnitude=0.0002, delta=0.90385):
    model_filepath = os.path.join(model_folder,
                                  'DenseNet201_best_balanced_acc.hdf5')
    print('Loading model: ', model_filepath)
    model = load_model(
        filepath=model_filepath,
        custom_objects={'balanced_accuracy': balanced_accuracy(num_classes)})
    image_data_format = K.image_data_format()
    model_param_map = get_transfer_model_param_map()
    generator = ImageIterator(
        image_paths=df['path'].tolist(),
        labels=None,
        augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
            model_param_map['DenseNet201'].input_size),
        preprocessing_function=model_param_map['DenseNet201'].preprocessing_func,
        batch_size=batch_size,
        shuffle=False,
        rescale=None,
        pregen_augmented_images=False,
        data_format=image_data_format)

    compute_perturbations, get_scaled_dense_pred_output = \
        get_perturbation_helper_func(model, temperature, num_classes)

    df_score = df[['image']].copy()
    softmax_scores = []
    learning_phase = 0  # 0 = test, 1 = train
    steps = math.ceil(df.shape[0] / batch_size)
    for _ in trange(steps):
        images = next(generator)
        perturbations = compute_perturbations([images, learning_phase])[0]
        # Get sign of perturbations
        perturbations = np.sign(perturbations)
        # DenseNet201 needs normalization
        perturbations = norm_perturbations(perturbations, image_data_format)
        # Add perturbations to images
        perturbative_images = images - magnitude * perturbations
        # Calculate the confidence after adding perturbations
        dense_pred_outputs = get_scaled_dense_pred_output(
            [perturbative_images, learning_phase])[0]
        softmax_probs = softmax(dense_pred_outputs)
        softmax_scores.extend(np.max(softmax_probs, axis=-1).tolist())

    del model
    K.clear_session()

    df_score['softmax_score'] = softmax_scores
    df_score['out_dist_score'] = 1 - logistic(
        x=df_score['softmax_score'], x0=delta, k=20)
    return df_score
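# `logistic` above maps the max-softmax score to an out-of-distribution score
# via 1 - logistic(score). It is not defined in this section; a minimal
# sketch, assuming the standard logistic curve with midpoint x0 and
# steepness k:
import numpy as np


def logistic(x, x0=0.0, k=1.0):
    """Standard logistic function: 1 / (1 + exp(-k * (x - x0)))."""
    return 1.0 / (1.0 + np.exp(-k * (x - x0)))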
def train_vanilla(df_train, df_val, num_classes, class_weight_dict, batch_size,
                  max_queue_size, epoch_num, input_size, model_folder):
    workers = os.cpu_count()
    classifier = VanillaClassifier(
        model_folder=model_folder,
        input_size=input_size,
        image_data_format=K.image_data_format(),
        num_classes=num_classes,
        batch_size=batch_size,
        max_queue_size=max_queue_size,
        class_weight=class_weight_dict,
        metrics=[balanced_accuracy(num_classes), 'accuracy'],
        image_paths_train=df_train['path'].tolist(),
        categories_train=np_utils.to_categorical(df_train['category'],
                                                 num_classes=num_classes),
        image_paths_val=df_val['path'].tolist(),
        categories_val=np_utils.to_categorical(df_val['category'],
                                               num_classes=num_classes))
    classifier.model.summary()
    print('Begin to train Vanilla CNN')
    classifier.train(epoch_num=epoch_num, workers=workers)
    del classifier
    K.clear_session()
def predict_test(model_folder, test_image_folder, pred_result_folder_test,
                 models_to_predict, category_names, unknown_method,
                 unknown_category, postfix, parameters, k_folds=0):
    os.makedirs(pred_result_folder_test, exist_ok=True)
    df_test = get_dataframe_from_img_folder(test_image_folder,
                                            has_path_col=True)
    df_test.drop(columns=['path']).to_csv(
        os.path.join(pred_result_folder_test, 'ISIC_2019_Test.csv'),
        index=False)
    hyperparameter_str = formated_hyperparameters(parameters)
    print(hyperparameter_str)

    model_kfold_folders = range(k_folds)
    if k_folds == 0:
        model_kfold_folders = [0]

    for model_to_predict in models_to_predict:
        for k_fold in model_kfold_folders:
            model_filepath = os.path.join(model_folder,
                                          model_to_predict.class_name,
                                          hyperparameter_str,
                                          "{}.hdf5".format(postfix))
            if not os.path.exists(model_filepath):
                model_filepath = os.path.join(model_folder,
                                              model_to_predict.class_name,
                                              hyperparameter_str, str(k_fold),
                                              "{}.hdf5".format(postfix))
            if os.path.exists(model_filepath):
                print("===== Predict test data using \"{}_{}\" with \"{}\" model ====="
                      .format(model_to_predict.class_name, k_fold, postfix))
                model = load_model(
                    filepath=model_filepath,
                    custom_objects={
                        'balanced_accuracy': balanced_accuracy(len(category_names))
                    })
                df_softmax = LesionClassifier.predict_dataframe(
                    model=model,
                    df=df_test,
                    category_names=category_names,
                    augmentation_pipeline=LesionClassifier.create_aug_pipeline(
                        0, model_to_predict.input_size, True),
                    preprocessing_function=model_to_predict.preprocessing_func,
                    batch_size=parameters.batch_size,
                    workers=os.cpu_count())
                df_softmax = handle_unknown(
                    model, model_to_predict, parameters, df_softmax, df_test,
                    unknown_method, prediction_label=str(k_fold),
                    pred_result_folder_test=pred_result_folder_test)
                del model
                K.clear_session()
            else:
                print("\"{}\" doesn't exist".format(model_filepath))
                return

        if k_folds > 0:
            print("===== Ensembling predictions from {} k folds of \"{}\" model ====="
                  .format(k_folds, model_to_predict.class_name))
            # Ensemble the model's k-fold predictions
            df_softmax = ensemble_predictions_k_fold(
                result_folder=pred_result_folder_test,
                parameters=parameters,
                category_names=category_names,
                model_name=model_to_predict.class_name,
                postfix=postfix,
                k_folds=k_folds)
            # Save ensemble predictions
            save_prediction_results(
                df_softmax, model_to_predict.class_name,
                pred_result_folder_test=pred_result_folder_test,
                parameters=parameters, postfix=postfix)

    if len(models_to_predict) > 1:
        model_names = [model.class_name for model in models_to_predict]
        print("===== Ensembling predictions from {} models using {} ====="
              .format(model_names, postfix))
        # Ensemble the models' predictions on the test data
        df_ensemble = ensemble_predictions(
            result_folder=pred_result_folder_test,
            parameters=parameters,
            category_names=category_names,
            model_names=model_names,
            postfix=postfix)
        # Save ensemble predictions
        save_prediction_results(
            df_ensemble,
            "-".join([f"{model_name}_{k_folds}"
                      for model_name in sorted(model_names)]),
            pred_result_folder_test=pred_result_folder_test)
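# `ensemble_predictions` and `ensemble_predictions_k_fold` are defined
# elsewhere; both presumably average per-image softmax vectors across the
# prediction CSVs written above. A minimal sketch of that averaging step,
# assuming each CSV has an 'image' column plus one probability column per
# category (the helper name here is illustrative, not the repo's API):
import numpy as np
import pandas as pd


def average_softmax_csvs(csv_paths, category_names):
    """Average per-category softmax columns across several prediction CSVs."""
    dfs = [pd.read_csv(p) for p in csv_paths]
    df_mean = dfs[0][['image']].copy()
    for c in category_names:
        df_mean[c] = np.mean([d[c].values for d in dfs], axis=0)
    # Predicted category = argmax over the averaged probabilities
    df_mean['pred_category'] = df_mean[category_names].idxmax(axis=1)
    return df_mean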
def compute_odin_softmax_scores(in_dist_pred_result_folder, in_dist_image_folder,
                                out_dist_pred_result_folder, out_dist_image_folder,
                                model_folder, softmax_score_folder, num_classes,
                                batch_size):
    """Calculate softmax scores for different combinations of ODIN parameters."""
    print('Begin to compute ODIN softmax scores')
    model_names = ['DenseNet201', 'Xception', 'ResNeXt50']
    # postfixes = ['best_balanced_acc', 'best_loss', 'latest']
    postfixes = ['best_balanced_acc']
    distributions = ['In', 'Out']

    # This file records which parameter combinations have already been computed.
    progress_file = os.path.join(softmax_score_folder, 'Done.txt')
    done_set = set()
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            done_set = set(line.rstrip('\n') for line in f)

    # ODIN parameters
    temperatures = [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1]
    magnitudes = np.round(np.arange(0, 0.0041, 0.0002), 4)

    model_param_map = get_transfer_model_param_map()
    image_data_format = K.image_data_format()
    learning_phase = 0  # 0 = test, 1 = train

    for modelattr in (ModelAttr(x, y) for x in model_names for y in postfixes):
        # In-distribution data
        df = {}
        df['In'] = pd.read_csv(
            os.path.join(in_dist_pred_result_folder,
                         "{}_{}.csv".format(modelattr.model_name,
                                            modelattr.postfix)))
        df['In']['path'] = df['In'].apply(
            lambda row: os.path.join(in_dist_image_folder,
                                     row['image'] + '.jpg'),
            axis=1)
        generator_in = ImageIterator(
            image_paths=df['In']['path'].tolist(),
            labels=None,
            augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
                model_param_map[modelattr.model_name].input_size),
            preprocessing_function=model_param_map[
                modelattr.model_name].preprocessing_func,
            batch_size=batch_size,
            shuffle=False,
            rescale=None,
            pregen_augmented_images=True,
            data_format=image_data_format)

        # Out-of-distribution data
        df['Out'] = pd.read_csv(
            os.path.join(out_dist_pred_result_folder,
                         "{}_{}.csv".format(modelattr.model_name,
                                            modelattr.postfix)))
        df['Out']['path'] = df['Out'].apply(
            lambda row: os.path.join(out_dist_image_folder,
                                     row['image'] + '.jpg'),
            axis=1)
        generator_out = ImageIterator(
            image_paths=df['Out']['path'].tolist(),
            labels=None,
            augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(
                model_param_map[modelattr.model_name].input_size),
            preprocessing_function=model_param_map[
                modelattr.model_name].preprocessing_func,
            batch_size=batch_size,
            shuffle=False,
            rescale=None,
            pregen_augmented_images=True,
            data_format=image_data_format)

        # Load model
        model_filepath = os.path.join(
            model_folder,
            "{}_{}.hdf5".format(modelattr.model_name, modelattr.postfix))
        print('Loading model: ', model_filepath)
        model = load_model(
            filepath=model_filepath,
            custom_objects={'balanced_accuracy': balanced_accuracy(num_classes)})
        need_norm_perturbations = modelattr.model_name in ('DenseNet201',
                                                           'ResNeXt50')

        for temperature in temperatures:
            compute_perturbations, get_scaled_dense_pred_output = \
                get_perturbation_helper_func(model, temperature, num_classes)
            for magnitude in magnitudes:
                for dist in distributions:
                    # Skip parameter combinations that are already done
                    param_comb_id = "{}_{}, {}, {}, {}".format(
                        modelattr.model_name, modelattr.postfix, dist,
                        temperature, magnitude)
                    if param_comb_id in done_set:
                        print('Skip ', param_comb_id)
                        continue

                    generator = generator_in if dist == 'In' else generator_out
                    print("\n===== Temperature: {}, Magnitude: {}, {}-Distribution ====="
                          .format(temperature, magnitude, dist))
                    softmax_score_sub_folder = os.path.join(
                        softmax_score_folder,
                        "{}_{}".format(temperature, magnitude))
                    os.makedirs(softmax_score_sub_folder, exist_ok=True)

                    steps = math.ceil(df[dist].shape[0] / batch_size)
                    generator.reset()
                    f = open(
                        os.path.join(
                            softmax_score_sub_folder,
                            "{}_{}_ODIN_{}.txt".format(modelattr.model_name,
                                                       modelattr.postfix,
                                                       dist)), 'w')
                    for _ in trange(steps):
                        images = next(generator)
                        perturbations = compute_perturbations(
                            [images, learning_phase])[0]
                        # Get sign of perturbations
                        perturbations = np.sign(perturbations)
                        # Normalize the perturbations to the same space as the images
                        # https://github.com/facebookresearch/odin/issues/5
                        # Perturbations divided by ISIC Training Set STD
                        if need_norm_perturbations:
                            perturbations = norm_perturbations(
                                perturbations, image_data_format)
                        # Add perturbations to images
                        perturbative_images = images - magnitude * perturbations
                        # Calculate the confidence after adding perturbations
                        dense_pred_outputs = get_scaled_dense_pred_output(
                            [perturbative_images, learning_phase])[0]
                        softmax_probs = softmax(dense_pred_outputs)
                        softmax_scores = np.max(softmax_probs, axis=-1)
                        for s in softmax_scores:
                            f.write("{}\n".format(s))
                    f.close()

                    with open(progress_file, 'a') as f_done:
                        f_done.write("{}\n".format(param_comb_id))
        del model
        K.clear_session()
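# `ModelAttr`, `get_perturbation_helper_func`, `softmax`, and
# `norm_perturbations` are defined elsewhere in the repo. The sketches below
# are plausible reconstructions, not the repo's actual code: the 'dense_pred'
# layer name and the per-channel std values are assumptions.
from collections import namedtuple

import numpy as np
from keras import backend as K

ModelAttr = namedtuple('ModelAttr', ['model_name', 'postfix'])  # as used above


def get_perturbation_helper_func(model, temperature, num_classes):
    """ODIN helpers: input-gradient of the temperature-scaled NLL, and the
    temperature-scaled pre-softmax output."""
    dense_pred_output = model.get_layer('dense_pred').output  # assumed layer name
    scaled_dense_pred_output = dense_pred_output / temperature
    pred_probs = K.softmax(scaled_dense_pred_output)
    # NLL of the class the model itself predicts
    predicted_class = K.one_hot(K.argmax(pred_probs, axis=-1), num_classes)
    loss = K.categorical_crossentropy(predicted_class, pred_probs)
    grads = K.gradients(loss, model.inputs)
    compute_perturbations = K.function(model.inputs + [K.learning_phase()],
                                       grads)
    get_scaled_dense_pred_output = K.function(
        model.inputs + [K.learning_phase()], [scaled_dense_pred_output])
    return compute_perturbations, get_scaled_dense_pred_output


def softmax(logits):
    """Numerically stable softmax over the last axis."""
    e = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)


def norm_perturbations(perturbations, image_data_format):
    """Divide each channel by the training-set std (see the ODIN issue linked
    above). The std values below are placeholders, not the real ISIC stats."""
    std = np.array([0.229, 0.224, 0.225])  # placeholder per-channel std
    shape = (1, 3, 1, 1) if image_data_format == 'channels_first' else (1, 1, 1, 3)
    return perturbations / std.reshape(shape)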
def main():
    parser = argparse.ArgumentParser(description='ISIC-2019 Skin Lesion Classifiers')
    parser.add_argument('data', metavar='DIR', help='path to data folder')
    parser.add_argument('--batchsize', type=int,
                        help='Batch size (default: %(default)s)', default=32)
    parser.add_argument('--maxqueuesize', type=int,
                        help='Maximum size for the generator queue (default: %(default)s)',
                        default=10)
    parser.add_argument('--epoch', type=int,
                        help='Number of epochs (default: %(default)s)', default=100)
    parser.add_argument('--model', dest='models', nargs='*',
                        choices=['Vanilla', 'DenseNet201', 'Xception', 'ResNeXt50',
                                 'NASNetLarge', 'InceptionResNetV2'],
                        help='Models')
    parser.add_argument('--autoshutdown', dest='autoshutdown', action='store_true',
                        help='Automatically shut down the computer after everything is done.')
    parser.add_argument('--training', dest='training', action='store_true',
                        help='Train models')
    parser.add_argument('--predval', dest='predval', action='store_true',
                        help='Predict validation set')
    parser.add_argument('--predtest', dest='predtest', action='store_true',
                        help='Predict the test data, which contains 8238 JPEG images of skin lesions.')
    parser.add_argument('--predvalresultfolder',
                        help='Name of the prediction result folder for the validation set (default: %(default)s)',
                        default='val_predict_results')
    parser.add_argument('--predtestresultfolder',
                        help='Name of the prediction result folder for the test data (default: %(default)s)',
                        default='test_predict_results')
    parser.add_argument('--modelfolder',
                        help='Name of the model folder (default: %(default)s)',
                        default='models')
    parser.add_argument('--odinscore', dest='odinscore', action='store_true',
                        help='Only relevant if approach is 1: compute Baseline and ODIN softmax scores.')
    parser.add_argument('--approach', type=int, choices=range(1, 3), required=True,
                        help='Approach for training the models')
    args = parser.parse_args()
    print(args)

    # Write command to a file
    with open('Cmd_History.txt', 'a') as f:
        f.write("{}\t{}\n".format(str(datetime.datetime.utcnow()), str(args)))

    data_folder = args.data
    pred_result_folder_val = args.predvalresultfolder
    pred_result_folder_test = args.predtestresultfolder
    model_folder = args.modelfolder
    softmax_score_folder = 'softmax_scores'
    batch_size = args.batchsize
    max_queue_size = args.maxqueuesize
    epoch_num = args.epoch
    approach = args.approach

    # ISIC data
    training_image_folder = os.path.join(data_folder, 'ISIC_2019_Training_Input')
    test_image_folder = os.path.join(data_folder, 'ISIC_2019_Test_Input')

    # Out-of-distribution data
    out_dist_image_folder = os.path.join(data_folder, 'Out_Distribution')
    out_dist_pred_result_folder = 'out_dist_predict_results'

    # Ground truth of the different approaches
    if approach == 1:
        ground_truth_file = os.path.join(data_folder,
                                         'ISIC_2019_Training_GroundTruth.csv')
        df_ground_truth, known_category_names, unknown_category_name = \
            load_isic_training_data(training_image_folder, ground_truth_file)
        category_names = known_category_names
    elif approach == 2:
        ground_truth_file = os.path.join(
            data_folder, 'ISIC_2019_Training_GroundTruth_DuplicateRemoved.csv')
        df_ground_truth, category_names = load_isic_training_and_out_dist_data(
            training_image_folder, ground_truth_file, out_dist_image_folder)
    else:
        print('Unknown approach:', approach)
        return

    df_train, df_val = train_validation_split(df_ground_truth)
    class_weight_dict, _ = compute_class_weight_dict(df_train)
    category_num = len(category_names)

    # Models used for prediction
    models_to_predict = []
    workers = os.cpu_count()

    # Train Vanilla CNN
    if args.models is not None and 'Vanilla' in args.models:
        input_size_vanilla = (224, 224)
        if args.training:
            train_vanilla(df_train, df_val, category_num, class_weight_dict,
                          batch_size, max_queue_size, epoch_num,
                          input_size_vanilla, model_folder)
        models_to_predict.append({
            'model_name': 'Vanilla',
            'input_size': input_size_vanilla,
            'preprocessing_function': VanillaClassifier.preprocess_input})

    # Guard against args.models being None so the copy cannot fail
    transfer_models = args.models.copy() if args.models is not None else []
    if 'Vanilla' in transfer_models:
        transfer_models.remove('Vanilla')

    # Train models by transfer learning
    if args.models is not None:
        model_param_map = (get_transfer_model_param_map() if approach == 1
                           else get_transfer_model_param_map_2())
        base_model_params = [model_param_map[x] for x in transfer_models]
        if args.training:
            train_transfer_learning(base_model_params, df_train, df_val,
                                    category_num, class_weight_dict, batch_size,
                                    max_queue_size, epoch_num, model_folder)
        for base_model_param in base_model_params:
            models_to_predict.append({
                'model_name': base_model_param.class_name,
                'input_size': base_model_param.input_size,
                'preprocessing_function': base_model_param.preprocessing_func})

    # Predict validation set
    if args.predval:
        os.makedirs(pred_result_folder_val, exist_ok=True)
        # Save ground truth of the validation set
        val_ground_truth_file_path = os.path.join(
            pred_result_folder_val, 'Validation_Set_GroundTruth.csv')
        df_val.drop(columns=['path', 'category']).to_csv(
            path_or_buf=val_ground_truth_file_path, index=False)
        print("Save \"{}\"".format(val_ground_truth_file_path))

        postfixes = ['best_balanced_acc', 'best_loss', 'latest']
        for postfix in postfixes:
            for m in models_to_predict:
                model_filepath = os.path.join(
                    model_folder, "{}_{}.hdf5".format(m['model_name'], postfix))
                if os.path.exists(model_filepath):
                    print("===== Predict validation set using \"{}_{}\" model ====="
                          .format(m['model_name'], postfix))
                    model = load_model(
                        filepath=model_filepath,
                        custom_objects={'balanced_accuracy': balanced_accuracy(category_num)})
                    LesionClassifier.predict_dataframe(
                        model=model,
                        df=df_val,
                        category_names=category_names,
                        augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(m['input_size']),
                        preprocessing_function=m['preprocessing_function'],
                        batch_size=batch_size,
                        workers=workers,
                        softmax_save_file_name=os.path.join(
                            pred_result_folder_val,
                            "{}_{}.csv".format(m['model_name'], postfix)),
                        logit_save_file_name=os.path.join(
                            pred_result_folder_val,
                            "{}_{}_logit.csv".format(m['model_name'], postfix)))
                    del model
                    K.clear_session()
                else:
                    print("\"{}\" doesn't exist".format(model_filepath))

    # Predict test data
    if args.predtest:
        os.makedirs(pred_result_folder_test, exist_ok=True)
        df_test = get_dataframe_from_img_folder(test_image_folder,
                                                has_path_col=True)
        df_test.drop(columns=['path']).to_csv(
            os.path.join(pred_result_folder_test, 'ISIC_2019_Test.csv'),
            index=False)
        postfix = 'best_balanced_acc'
        for m in models_to_predict:
            model_filepath = os.path.join(
                model_folder, "{}_{}.hdf5".format(m['model_name'], postfix))
            if os.path.exists(model_filepath):
                print("===== Predict test data using \"{}_{}\" model ====="
                      .format(m['model_name'], postfix))
                model = load_model(
                    filepath=model_filepath,
                    custom_objects={'balanced_accuracy': balanced_accuracy(category_num)})
                LesionClassifier.predict_dataframe(
                    model=model,
                    df=df_test,
                    category_names=category_names,
                    augmentation_pipeline=LesionClassifier.create_aug_pipeline_val(m['input_size']),
                    preprocessing_function=m['preprocessing_function'],
                    batch_size=batch_size,
                    workers=workers,
                    softmax_save_file_name=os.path.join(
                        pred_result_folder_test,
                        "{}_{}.csv".format(m['model_name'], postfix)),
                    logit_save_file_name=os.path.join(
                        pred_result_folder_test,
                        "{}_{}_logit.csv".format(m['model_name'], postfix)))
                del model
                K.clear_session()
            else:
                print("\"{}\" doesn't exist".format(model_filepath))

        # Ensemble the models' predictions on the test data
        df_ensemble = ensemble_predictions(
            result_folder=pred_result_folder_test,
            category_names=category_names,
            save_file=False,
            model_names=transfer_models,
            postfixes=[postfix]).drop(columns=['pred_category'])
        if approach == 1:
            # Compute out-of-distribution scores
            df_score = compute_out_of_distribution_score(
                model_folder=model_folder, df=df_test,
                num_classes=category_num, batch_size=batch_size)
            # Merge ensemble predictions with out-of-distribution scores
            df_ensemble[unknown_category_name] = df_score['out_dist_score']
        df_ensemble.to_csv(os.path.join(pred_result_folder_test,
                                        "Ensemble_{}.csv".format(postfix)),
                           index=False)

    # Compute Baseline and ODIN softmax scores
    if args.odinscore and approach == 1:
        os.makedirs(softmax_score_folder, exist_ok=True)
        compute_baseline_softmax_scores(
            in_dist_pred_result_folder=pred_result_folder_val,
            out_dist_pred_result_folder=out_dist_pred_result_folder,
            softmax_score_folder=softmax_score_folder)
        compute_odin_softmax_scores(
            in_dist_pred_result_folder=pred_result_folder_val,
            in_dist_image_folder=training_image_folder,
            out_dist_pred_result_folder=out_dist_pred_result_folder,
            out_dist_image_folder=out_dist_image_folder,
            model_folder=model_folder,
            softmax_score_folder=softmax_score_folder,
            num_classes=category_num,
            batch_size=batch_size)

    # Shutdown
    if args.autoshutdown:
        # Shutdown after 2 minutes
        os.system("sudo shutdown -h +2")
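# Standard entry-point guard (presumably how the original script is invoked):
if __name__ == '__main__':
    main()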
def evaluate_model(dataset, save_file, random_state, clf, clf_name,
                   hyper_params, longitudinal=False, rare=True):
    print('reading data...', end='')
    features, labels, pt_ids, feature_names, zfile = read_file(
        dataset, longitudinal, rare)
    print('done.', len(labels), 'samples,', np.sum(labels == 1), 'cases,',
          features.shape[1], 'features')
    if 'Feat' in clf_name:
        # set feature names
        clf.feature_names = ','.join(feature_names).encode()

    n_splits = 10
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True,
                         random_state=random_state)
    scoring = make_scorer(balanced_accuracy)

    ###
    # match cases and controls on age and sex
    ###
    idx_age = np.argmax(feature_names == 'age')
    idx_sex = np.argmax(feature_names == 'SEX')
    # sampler = NearMiss(random_state=random_state, return_indices=True)
    sampler = QuartileExactMatch(quart_locs=[idx_age], exact_locs=[idx_sex],
                                 random_state=random_state)
    print('sampling data...', end='')
    X, y, sidx = sampler.fit_sample(features, labels)
    print('sampled data contains', np.sum(y == 1), 'cases', np.sum(y == 0),
          'controls')

    ###
    # split into train/test
    ###
    X_train, X_test, y_train, y_test, sidx_train, sidx_test = \
        train_test_split(X, y, sidx, train_size=0.5, test_size=0.5,
                         random_state=random_state)
    # X,y,sidx = sampler.fit_sample(features[train_idx],labels[train_idx])

    if len(hyper_params) > 0:
        param_grid = list(ParameterGrid(hyper_params))
        # clone estimators
        Clfs = [clone(clf).set_params(**p) for p in param_grid]
        # fit with hyperparameter optimization
        cv_scores = np.zeros((len(param_grid), n_splits))  # cross-validated scores
        cv_preds = np.zeros((len(param_grid), len(y_train)))  # cross-validated predictions
        cv_probs = np.zeros((len(param_grid), len(y_train)))  # cross-validated probabilities
        FI = np.zeros((len(param_grid), features.shape[1]))  # cross-validated, permuted feature importance
        FI_internal = np.zeros((len(param_grid), features.shape[1]))  # cross-validated feature importance

        ###########
        # manual 10-fold cross validation with hyperparameter tuning
        t0 = time.process_time()
        for j, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            print('fold', j)
            for i, est in enumerate(Clfs):
                print('training', type(est).__name__, i + 1, 'of', len(Clfs))
                if 'Feat' in clf_name:
                    est.logfile = (est.logfile.decode().split('.log')[0]
                                   + '.log.param' + str(i)
                                   + '.cv' + str(j)).encode()
                ##########
                # fit model
                ##########
                if longitudinal:
                    est.fit(X_train[train_idx], y_train[train_idx], zfile,
                            pt_ids[sidx_train[train_idx]])
                else:
                    est.fit(X_train[train_idx], y_train[train_idx])
                ##########
                # get predictions
                ##########
                print('getting validation predictions...')
                if longitudinal:
                    # cv_preds[i,val_idx] = est.predict(X_train[val_idx],
                    #                                   zfile,pt_ids[sidx_train[train_idx]])
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[train_idx]])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[train_idx]])
                else:
                    # cv_preds[i,val_idx] = est.predict(X_train[val_idx])
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx])
                ##########
                # scores
                ##########
                cv_scores[i, j] = roc_auc_score(y_train[val_idx],
                                                cv_probs[i, val_idx])
        runtime = time.process_time() - t0
        ###########
        print('gridsearch finished in', runtime, 'seconds')

        ##########
        # get the best model and its information
        mean_cv_scores = [np.mean(s) for s in cv_scores]
        best_clf = Clfs[np.argmax(mean_cv_scores)]
        ##########
    else:
        print('skipping hyperparameter tuning')
        best_clf = clf  # this option is for skipping model tuning

    t0 = time.process_time()
    print('fitting tuned model to all training data...')
    if longitudinal:
        best_clf.fit(X_train, y_train, zfile, pt_ids[sidx_train])
    else:
        best_clf.fit(X_train, y_train)
    if len(hyper_params) == 0:
        runtime = time.process_time() - t0

    # cv_predictions = cv_preds[np.argmax(mean_cv_scores)]
    # cv_probabilities = cv_probs[np.argmax(mean_cv_scores)]

    if not longitudinal:
        # internal feature importances
        cv_FI_int = compute_imp_score(best_clf, clf_name, X_train, y_train,
                                      random_state, perm=False)
        # cv_FI_int = FI_internal[np.argmax(mean_cv_scores)]
        # permutation importances
        FI = compute_imp_score(best_clf, clf_name, X_test, y_test,
                               random_state, perm=True)

    ##########
    # metrics: test the best classifier on the held-out test set
    print('getting test predictions...')
    if longitudinal:
        print('best_clf.predict(X_test, zfile, pt_ids[sidx_test])')
        test_predictions = best_clf.predict(X_test, zfile, pt_ids[sidx_test])
        if getattr(clf, "predict_proba", None):
            print('best_clf.predict_proba(X_test, zfile, pt_ids[sidx_test])')
            test_probabilities = best_clf.predict_proba(
                X_test, zfile, pt_ids[sidx_test])[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(
                X_test, zfile, pt_ids[sidx_test])
    else:
        test_predictions = best_clf.predict(X_test)
        if getattr(clf, "predict_proba", None):
            test_probabilities = best_clf.predict_proba(X_test)[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(X_test)

    # # write cv_pred and cv_prob to file
    # df = pd.DataFrame({'cv_prediction':cv_predictions,'cv_probability':cv_probabilities,
    #                    'pt_id':pt_ids})
    # df.to_csv(save_file.split('.csv')[0] + '_' + str(random_state) + '.cv_predictions',index=None)

    accuracy = accuracy_score(y_test, test_predictions)
    macro_f1 = f1_score(y_test, test_predictions, average='macro')
    bal_acc = balanced_accuracy(y_test, test_predictions)
    roc_auc = roc_auc_score(y_test, test_probabilities)

    ##########
    # save results to file
    print('saving results...')
    param_string = ','.join(
        ['{}={}'.format(p, v) for p, v in best_clf.get_params().items()
         if p != 'feature_names']).replace('\n', '').replace(' ', '')
    out_text = '\t'.join([dataset.split('/')[-1], clf_name, param_string,
                          str(random_state), str(accuracy), str(macro_f1),
                          str(bal_acc), str(roc_auc), str(runtime)])
    print(out_text)
    with open(save_file, 'a') as out:
        out.write(out_text + '\n')
    sys.stdout.flush()

    print('saving feature importance')
    # write feature importances
    if not longitudinal:
        feature_importance(save_file, best_clf, feature_names, X_test, y_test,
                           random_state, clf_name, param_string, cv_FI_int,
                           perm=False)
        feature_importance(save_file, best_clf, feature_names, X_test, y_test,
                           random_state, clf_name, param_string, FI, perm=True)

    # write roc curves
    print('saving roc')
    roc(save_file, y_test, test_probabilities, random_state, clf_name,
        param_string)

    return best_clf
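# `balanced_accuracy` here is a plain (y_true, y_pred) scoring function,
# unlike the Keras metric factory of the same name used earlier. A minimal
# sketch, assuming the usual definition as the macro-average of per-class
# recall (equivalent to sklearn's balanced_accuracy_score):
from sklearn.metrics import recall_score


def balanced_accuracy(y_true, y_pred):
    """Mean of per-class recall."""
    return recall_score(y_true, y_pred, average='macro')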