Beispiel #1
0
def perform_experiment(T, V, experiment_parameters, evaluation_strategy, dataset_location, n_folds=10, labelled_percentage=1.0, random_seed=None, use_unlabelled=True, use_parallel=True):
    """Run one multi-view experiment: split, unlabel, grid search, train, test.

    The svmlight file at ``dataset_location`` is loaded and divided into ``V``
    views. A stratified shuffle split with a 30% test share provides the
    train/test partition; only the LAST split the splitter yields is used.
    Part of the training labels is then overwritten with ``0.0`` (the value
    this module uses for "unlabelled"), ``gridsearch`` picks the parameter
    set, a model is trained with it, and the held-out part is scored by
    averaging the per-view predictions.

    Parameters:
        T: passed through to ``gridsearch``, ``evaluation_strategy`` and (as
            ``T + 1``) to the prediction routine.
            NOTE(review): presumably the number of training iterations -
            confirm against ``training``.
        V: number of views; views are addressed as ``0 .. V-1``.
        experiment_parameters: forwarded unchanged to ``gridsearch``.
        evaluation_strategy: callable
            ``(y_train, T, use_unlabelled, random_seed)`` returning the order
            of samples handed to ``training``.
        dataset_location: path of the svmlight-formatted dataset.
        n_folds: forwarded to ``gridsearch`` and reported in the summary line.
        labelled_percentage: a training sample loses its label when its
            uniform draw from [0, 1) exceeds this value, so despite the name
            it is a fraction (``1.0`` keeps every label).
        random_seed: seeds the view split, the train/test split, the global
            NumPy RNG used for unlabelling, and is forwarded to the helpers.
        use_unlabelled: forwarded to ``gridsearch`` and ``evaluation_strategy``.
        use_parallel: forwarded to ``gridsearch``.

    Returns:
        dict with keys ``"auc"``, ``"fpr"``, ``"tpr"`` (from
        ``area_under_the_roc_curve`` on the test part), ``"model"`` (the
        trained alphas) and ``"best_grid"`` (the ``gridsearch`` result).

    Raises:
        ValueError: if the splitter yields no train/test split at all.
    """
    X, Y = load_svmlight_file(dataset_location)

    # ensure the dataset gets split into multiple views
    X_views = split_dataset_into_random_views(X, V, random_seed)

    # retrieve the train-test split; exhaust the splitter and keep only the
    # indices of the last split, so the views are sliced a single time
    folds = StratifiedShuffleSplit(Y, test_size=0.3, random_state=random_seed)
    train_index = test_index = None
    for train_index, test_index in folds:
        pass
    if train_index is None:
        raise ValueError('no train/test split could be generated')

    X_train = {n: X_views[n][train_index] for n in X_views}
    X_test = {n: X_views[n][test_index] for n in X_views}
    # indexing with an index array copies, so Y itself is not modified below
    y_train, y_test = Y[train_index], Y[test_index]

    # unlabel the training set: one uniform draw per sample, label dropped
    # (set to 0.0) when the draw exceeds labelled_percentage
    np.random.seed(random_seed)
    unlabel = np.random.random(len(y_train))
    y_train[unlabel > labelled_percentage] = 0.0

    # grid search for the best grid
    best_grid = gridsearch(X_train, y_train, T, V, experiment_parameters, n_folds, evaluation_strategy, use_unlabelled, use_parallel, random_seed)

    # predetermine the order of samples
    order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)

    # generate the model with the winning parameter set
    grid = best_grid['grid']
    kernel_method = grid['kernel_method']
    kernel_parameters = grid['kernel_parameters']
    lambdas = grid['lambdas']
    alphas, predictions = training(X_train, y_train, V, order_of_samples, kernel_method, kernel_parameters, lambdas)

    # test the model: the estimate for a sample is the mean of its V
    # per-view predictions
    y_preds_est = []
    for i in range(len(y_test)):
        y_pred_est = 0.0
        for n in range(V):
            y_pred_est += coagreement_prediction_for_view_n(X_test[n][i], X_train, y_train, V, n, T+1, predictions, alphas, kernel_method, kernel_parameters, lambdas)
        y_preds_est.append(y_pred_est/V)

    # retrieve the metrics
    AUC, fpr, tpr = area_under_the_roc_curve(y_test, y_preds_est)

    # single parenthesised argument: prints identically under Python 2 and 3
    print('Achieved in validation '+str(AUC)+' AUC, and in training '+str(best_grid['AUC'])+' over '+str(n_folds)+' folds')

    return {"auc":AUC, "fpr":fpr, "tpr":tpr, "model":alphas, "best_grid":best_grid}
Beispiel #2
0
def crossvalidate(T, V, X_views, Y, folds, n_folds, use_unlabelled, evaluation_strategy, parameter_set, random_seed):
    """Cross-validate one parameter set and return its mean AUC.

    For every ``(train_index, test_index)`` pair in ``folds`` a model is
    trained on the training part and evaluated on the LABELLED samples of the
    test part (labels equal to ``0.0`` mark unlabelled samples and are
    skipped). The prediction for a sample is the mean of its ``V`` per-view
    predictions. The per-fold AUCs are summed and divided by ``n_folds``.

    Parameters:
        T: forwarded to ``evaluation_strategy`` and (as ``T + 1``) to the
            prediction routine.
        V: number of views; views are addressed as ``0 .. V-1``.
        X_views: mapping from view key to that view's sample matrix.
        Y: label vector aligned with the rows of every view.
        folds: iterable of ``(train_index, test_index)`` pairs.
        n_folds: divisor for the accumulated AUC; also reported in the
            summary line.
        use_unlabelled: forwarded to ``evaluation_strategy``.
        evaluation_strategy: callable
            ``(y_train, T, use_unlabelled, random_seed)`` returning the order
            of samples handed to ``training``.
        parameter_set: dict providing ``'kernel_method'``,
            ``'kernel_parameters'`` and ``'lambdas'``.
        random_seed: forwarded to ``evaluation_strategy``.

    Returns:
        dict with ``'AUC'`` (the averaged AUC) and ``'grid'`` (the given
        ``parameter_set``).
    """
    kernel_method = parameter_set['kernel_method']
    kernel_parameters = parameter_set['kernel_parameters']
    lambdas = parameter_set['lambdas']

    AUC = 0.0
    # single fold:
    for train_index, test_index in folds:
        X_train = {n: X_views[n][train_index] for n in X_views}
        X_test = {n: X_views[n][test_index] for n in X_views}
        y_train, y_test = Y[train_index], Y[test_index]

        order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)

        alphas, predictions = training(X_train, y_train, V, order_of_samples, kernel_method, kernel_parameters, lambdas)

        # ensure we only use the labelled test set; remember WHERE the
        # labelled samples sit so features and labels stay aligned (taking
        # the first k rows of X_test would pair labels with the wrong rows
        # whenever unlabelled samples are interspersed)
        labelled_positions = [pos for pos, label in enumerate(y_test) if label != 0.0]
        labelled_y_test = [y_test[pos] for pos in labelled_positions]

        y_preds_est = []
        for pos in labelled_positions:
            y_pred_est = 0.0
            for n in range(V):
                y_pred_est += coagreement_prediction_for_view_n(X_test[n][pos], X_train, y_train, V, n, T+1, predictions, alphas, kernel_method, kernel_parameters, lambdas)
            y_preds_est.append(y_pred_est/V)

        AUC_fold, _, _ = area_under_the_roc_curve(labelled_y_test, y_preds_est)
        AUC += AUC_fold

    # assumes ``folds`` yields exactly ``n_folds`` splits
    AUC /= n_folds

    # single parenthesised argument: prints identically under Python 2 and 3
    print('grid search with set '+str(parameter_set)+' achieving '+str(AUC)+' over '+str(n_folds)+' folds')

    current_grid = {}
    current_grid['AUC'] = AUC
    current_grid['grid'] = parameter_set

    return current_grid