import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import StratifiedShuffleSplit


def perform_experiment(T, V, experiment_parameters, evaluation_strategy,
                       dataset_location, n_folds=10, labelled_percentage=1.0,
                       random_seed=None, use_unlabelled=True, use_parallel=True):
    """In each experiment the data is split into three parts: two parts are used
    for training and the remaining part for testing. This is repeated three
    times, until every part has been used for testing. The result of an
    experiment is the average performance over the three test parts."""
    X, Y = load_svmlight_file(dataset_location)
    # split the dataset into V randomly assigned views
    X_views = split_dataset_into_random_views(X, V, random_seed)
    # retrieve the stratified train/test folds
    folds = StratifiedShuffleSplit(test_size=0.3,
                                   random_state=random_seed).split(X, Y)
    for train_index, test_index in folds:
        X_train = {n: X_views[n][train_index] for n in X_views.keys()}
        X_test = {n: X_views[n][test_index] for n in X_views.keys()}
        y_train, y_test = Y[train_index], Y[test_index]
        # unlabel part of the training set: a label of 0.0 marks an unlabelled sample
        np.random.seed(random_seed)
        unlabel = np.random.random(len(y_train))
        for i in range(len(unlabel)):
            if unlabel[i] > labelled_percentage:
                y_train[i] = 0.0
        # grid search for the best parameter set
        best_grid = gridsearch(X_train, y_train, T, V, experiment_parameters,
                               n_folds, evaluation_strategy, use_unlabelled,
                               use_parallel, random_seed)
        # predetermine the order in which samples are visited during training
        order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)
        # train the model with the best parameter set
        alphas, predictions = training(X_train, y_train, V, order_of_samples,
                                       best_grid['grid']['kernel_method'],
                                       best_grid['grid']['kernel_parameters'],
                                       best_grid['grid']['lambdas'])
        # test the model: average the per-view predictions for each test sample
        y_preds_est = []
        y_preds = []
        for i in range(len(y_test)):
            y_pred = {}
            y_pred_est = 0.0
            for n in range(V):
                y_pred[n] = coagreement_prediction_for_view_n(
                    X_test[n][i], X_train, y_train, V, n, T + 1, predictions,
                    alphas, best_grid['grid']['kernel_method'],
                    best_grid['grid']['kernel_parameters'],
                    best_grid['grid']['lambdas'])
                y_pred_est += y_pred[n]
            y_preds.append(y_pred)
            y_preds_est.append(y_pred_est / V)
        # retrieve the metrics
        AUC, fpr, tpr = area_under_the_roc_curve(y_test, y_preds_est)
        print('Achieved ' + str(AUC) + ' AUC in validation, and ' + str(best_grid['AUC'])
              + ' in training over ' + str(n_folds) + ' folds')
        return {"auc": AUC, "fpr": fpr, "tpr": tpr,
                "model": alphas, "best_grid": best_grid}
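
# Minimal self-contained sketch (an assumption, not part of the original
# pipeline): it reproduces the stratified train/test split and the random
# unlabelling step used in perform_experiment() on a synthetic toy problem,
# where a label of 0.0 marks an unlabelled sample. Only numpy and
# scikit-learn APIs are used; the data below is made up.
def _demo_split_and_unlabel(labelled_percentage=0.3, random_seed=0):
    X_toy = np.random.RandomState(random_seed).rand(20, 4)
    y_toy = np.array([1.0] * 10 + [-1.0] * 10)
    folds = StratifiedShuffleSplit(n_splits=1, test_size=0.3,
                                   random_state=random_seed).split(X_toy, y_toy)
    for train_index, test_index in folds:
        y_train = y_toy[train_index]
        # mark a fraction of the training labels as unlabelled (0.0)
        np.random.seed(random_seed)
        unlabel = np.random.random(len(y_train))
        y_train[unlabel > labelled_percentage] = 0.0
        print('labelled:', int(np.sum(y_train != 0.0)),
              'unlabelled:', int(np.sum(y_train == 0.0)))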
def crossvalidate(T, V, X_views, Y, folds, n_folds, use_unlabelled,
                  evaluation_strategy, parameter_set, random_seed):
    AUC = 0.0
    # evaluate a single parameter set on each fold and average the AUC
    for train_index, test_index in folds:
        X_train = {n: X_views[n][train_index] for n in X_views.keys()}
        X_test = {n: X_views[n][test_index] for n in X_views.keys()}
        y_train, y_test = Y[train_index], Y[test_index]
        order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)
        alphas, predictions = training(X_train, y_train, V, order_of_samples,
                                       parameter_set['kernel_method'],
                                       parameter_set['kernel_parameters'],
                                       parameter_set['lambdas'])
        y_preds_est = []
        y_preds = []
        # score only the labelled part of the test fold (0.0 marks an unlabelled
        # sample); keep the original indices so predictions line up with their labels
        labelled_indices = [i for i in range(len(y_test)) if y_test[i] != 0.0]
        for i in labelled_indices:
            y_pred = {}
            y_pred_est = 0.0
            for n in range(V):
                y_pred[n] = coagreement_prediction_for_view_n(
                    X_test[n][i], X_train, y_train, V, n, T + 1, predictions,
                    alphas, parameter_set['kernel_method'],
                    parameter_set['kernel_parameters'], parameter_set['lambdas'])
                y_pred_est += y_pred[n]
            y_preds.append(y_pred)
            y_preds_est.append(y_pred_est / V)
        AUC_fold, _, _ = area_under_the_roc_curve(y_test[labelled_indices], y_preds_est)
        AUC += AUC_fold
    AUC /= n_folds
    print('Grid search with set ' + str(parameter_set) + ' achieved ' + str(AUC)
          + ' AUC over ' + str(n_folds) + ' folds')
    current_grid = {}
    current_grid['AUC'] = AUC
    current_grid['grid'] = parameter_set
    return current_grid
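
# Minimal self-contained sketch (an assumption, not original code): it mirrors
# the evaluation loop in crossvalidate(), averaging per-view decision values
# into a single estimate and scoring it with ROC AUC. The synthetic scores
# below are placeholders, and sklearn's roc_auc_score stands in for whatever
# area_under_the_roc_curve wraps.
def _demo_view_averaging_auc():
    from sklearn.metrics import roc_auc_score

    # per-view decision values for four labelled test samples over V = 2 views
    view_scores = {0: np.array([0.9, -0.3, 0.4, -0.8]),
                   1: np.array([0.7, -0.1, 0.2, -0.6])}
    y_true = np.array([1.0, -1.0, 1.0, -1.0])
    # average the views, exactly as y_pred_est / V in the loop above
    y_est = sum(view_scores.values()) / len(view_scores)
    print('AUC:', roc_auc_score(y_true, y_est))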