Beispiel #1
0
def _split_train_val(patient_folders):
    """Splits the patient folders into train and validation splits."""
    # Validation ids: a real cross-validation fold for the submission run,
    # otherwise a single dummy patient id (with a loud warning).
    if paths.SUBMISSION_NR == 1:
        print("Using proper validation set")
        val_ids = validation_set.get_cross_validation_indices(
            indices=ALL_TRAIN_PATIENT_IDS, validation_index=0)
    else:
        print("WARNING: no validation set!!")
        val_ids = [1]

    # Every patient id not reserved for validation goes to training.
    train_ids = [pid for pid in ALL_TRAIN_PATIENT_IDS if pid not in val_ids]

    # Partition the folders by the patient id embedded in each path:
    # validation ids go to the validation split, everything else to train.
    train_folders = []
    val_folders = []
    for folder in patient_folders:
        if _extract_id_from_path(folder) in val_ids:
            val_folders.append(folder)
        else:
            train_folders.append(folder)

    return (train_folders, val_folders, val_ids, train_ids)
 def test_default_split(self):
     """Check get_cross_validation_indices against the pickled reference splits."""
     # Create the cross-validation splits, one per validation fold index.
     splits = [
         validation_set.get_cross_validation_indices(
             _TEST_INDICES, i, _TEST_NO_SPLITS, _TEST_SEED)
         for i in range(_TEST_NO_SPLITS)]
     # Load the reference splits. Pickle files must be opened in binary
     # mode ('rb'); text mode can corrupt the stream on some platforms.
     with open(_DEFAULT_SPLIT_RESULT_FILE, 'rb') as f:
         target_splits = pickle.load(f)
     # Every generated split must match its stored reference exactly.
     for gen, target in zip(splits, target_splits):
         self.assertEqual(gen, target)
Beispiel #3
0
def _split_train_val(patient_folders):
    """Splits the patient folders into train and validation splits.

    :param patient_folders: iterable of per-patient folder paths.
    :return: tuple (train_patient_folders, validation_patient_folders,
        validation_patients_indices, train_patients_indices).
    """
    # Construct train and validation splits using default parameters
    if paths.SUBMISSION_NR == 1:
        print "Using proper validation set"
        validation_patients_indices = validation_set.get_cross_validation_indices(
            indices=ALL_TRAIN_PATIENT_IDS, validation_index=0)
    else:
        # NOTE(review): single dummy id keeps the code path alive while
        # effectively training on (almost) all patients.
        print "WARNING: no validation set!!"
        validation_patients_indices = [1]

    # Train ids are simply every patient id not chosen for validation.
    train_patients_indices = [i for i in ALL_TRAIN_PATIENT_IDS if i not in validation_patients_indices]

    # A folder belongs to the train split unless the patient id parsed from
    # its path is one of the validation ids ...
    train_patient_folders = [
        folder for folder in patient_folders
        if not _extract_id_from_path(folder) in validation_patients_indices]
    # ... and the validation folders are exactly the remaining ones.
    validation_patient_folders = [
        folder for folder in patient_folders
        if folder not in train_patient_folders]

    return (
        train_patient_folders, validation_patient_folders,
        validation_patients_indices, train_patients_indices)
Beispiel #4
0
    # Per-split patient counts read from the num_patients mapping
    # (keys 'train'/'validation'/'test', as the lookups below show).
    NUM_TRAIN_PATIENTS = num_patients['train']
    NUM_VALID_PATIENTS = num_patients['validation']
    NUM_TEST_PATIENTS = num_patients['test']
    NUM_PATIENTS = NUM_TRAIN_PATIENTS + NUM_VALID_PATIENTS + NUM_TEST_PATIENTS


##############
# Sunny data #
##############
# This small dataset is loaded into memory
_SUNNY_DATA_PATH = os.path.join(_DATA_FOLDER, "pkl_annotated", "data.pkl")

# _sunny_data is a mapping with (at least) "images" and "labels" entries.
_sunny_data = _load_file(_SUNNY_DATA_PATH)
num_sunny_images = len(_sunny_data["images"])

# Validation image indices come from the default cross-validation
# parameters; the remaining indices form the training set.
_validation_sunny_indices = validation_set.get_cross_validation_indices(
    indices=range(num_sunny_images))
_train_sunny_indices = [
    i for i in range(num_sunny_images) if i not in _validation_sunny_indices]

# Materialise the train/validation image and label arrays via fancy indexing.
sunny_train_images = np.array(_sunny_data["images"])[_train_sunny_indices]
sunny_train_labels = np.array(_sunny_data["labels"])[_train_sunny_indices]
sunny_validation_images = np.array(_sunny_data["images"])[_validation_sunny_indices]
sunny_validation_labels = np.array(_sunny_data["labels"])[_validation_sunny_indices]


###########################
# Data form preprocessing #
###########################

# NOTE(review): this tuple literal continues beyond the chunk boundary.
_HOUGH_ROI_PATHS = (
    TEMP_FILES_PATH + 'pkl_train_slice2roi.pkl',
Beispiel #5
0
import re
from configuration import config
import cPickle as pickle
import utils
from validation_set import get_cross_validation_indices
import random

print "Loading data"

# Sort the study folders numerically by the patient id embedded in the path,
# because glob returns entries in arbitrary order.
patient_folders = sorted(
    glob.glob("/data/dsb15_pkl/pkl_train/*/study/"),
    key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)
                           ))  # glob is non-deterministic!

# Patient ids run from 1 to 500 (inclusive); fold 0 serves as validation.
validation_patients_indices = get_cross_validation_indices(indices=range(
    1, 501),
                                                           validation_index=0)
train_patients_indices = [
    i for i in range(1, 501) if i not in validation_patients_indices
]

# Regex that matches any path containing a validation patient id segment.
VALIDATION_REGEX = "|".join(
    ["(/%d/)" % i for i in validation_patients_indices])

# Folders whose path does not contain a validation id are training folders;
# the remainder form the validation set.
train_patient_folders = [
    folder for folder in patient_folders
    if re.search(VALIDATION_REGEX, folder) is None
]
validation_patient_folders = [
    folder for folder in patient_folders if folder not in train_patient_folders
]
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    Optimize softmax mixing weights over per-expert CDF predictions by
    minimizing a CRPS-style squared loss with Theano/Lasagne (Adam updates).

    :param expert_predictions: experts x validation_samples x 600 x
    :param mask_matrix: experts x validation_samples x
    :param targets: validation_samples x 600 x
    :param average_distribution: 600 x
    :param eps: numerical floor used when taking logs of the pdf
    :param num_cross_validation_masks: number of CV splits; 0 fits on all
        samples at once and returns the expanded final weights directly
    :param num_folds: number of independent CV repetitions (rng_seed=fold)
    :param cutoff: experts with previous weight <= cutoff are dropped up front
    :param do_optimization: when False, just evaluate the combined
        cumulative distribution and return its first row
    :param expert_weights: previous weights used for the cutoff filtering
    :param optimal_params: warm-start parameter vector, if available
    :param special_average: use the information-weighted geometric average
        instead of the plain masked weighted mean
    :return: (expert weights, loss, raw parameter vector) when optimizing
    """
    # Drop experts whose previous weight fell below the cutoff.
    if expert_weights is not None:
        mask_matrix = mask_matrix[expert_weights>cutoff,:]  # remove
        expert_predictions = expert_predictions[expert_weights>cutoff,:,:]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE-1)/2, (WINDOW_SIZE-1)/2, num=WINDOW_SIZE, dtype='float32'))  # targets = (NUM_VALIDATIONS, 600)

    NUM_VALIDATIONS = expert_predictions.shape[1]
    # `ind` selects which validation samples participate in the current
    # train/validation subset; it is updated via set_value() below.
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # targets = (NUM_VALIDATIONS, 600)

    # Parameter vector = per-expert logits followed by filter parameters.
    if optimal_params is None:
        params_init = np.concatenate([ np.ones((NUM_EXPERTS,), dtype='float32'),
                                       np.ones((NUM_FILTER_PARAMETERS,), dtype='float32') ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')  # expert weights = (NUM_EXPERTS,)

    C = 0.0001  # strength of the l1 term added to the training loss
    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        weights = T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0)
        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)
        # expression
        masked_weights = mask * weights
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')
        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)  # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:,:,None]*weights).astype('float32'))
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf<=0] = np.log(eps)  # floor zero-probability bins before the log
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)

        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0, 'x')

        #the different predictions, are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log,axis=-1).dimshuffle(0,'x')  # stabilizes rounding errors?

        geom_av = T.exp(geom_av_log)

        geom_pdf = geom_av/T.sum(geom_av,axis=-1).dimshuffle(0,'x')
        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        # Evaluation-only mode: use all samples, compile, and return row 0.
        ind.set_value(list(range(NUM_VALIDATIONS)))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

    # Mean squared difference between the combined CDF and the target CDFs;
    # only the training loss carries the l1 regularization term.
    CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
    CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

    iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore", updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
    f_val = theano.function([], CRPS_valid)

    def optimize_my_params():
        # Fixed iteration budget; fewer steps in the special-average mode.
        for _ in range(40 if special_average else 100):  # early stopping
            score = iter_optimize()
        result = params.get_value()
        return result, score


    if num_cross_validation_masks==0:
        # No CV: fit on every sample, then expand the optimized logits back
        # to the original expert positions (dropped experts get -1e10,
        # i.e. ~zero weight after the softmax).
        ind.set_value(list(range(NUM_VALIDATIONS)))
        params.set_value(params_init)
        optimal_params, train_score = optimize_my_params()
        final_weights = -1e10 * np.ones(expert_weights.shape,)
        final_weights[np.where(expert_weights>cutoff)] = optimal_params[:NUM_EXPERTS]
        final_params = np.concatenate(( final_weights, optimal_params[NUM_EXPERTS:]))
        return softmax(final_weights), train_score, final_params
    else:
        final_params = []
        final_losses = []
        print()
        print()
        print()
        for fold in range(num_folds):
            for i_cross_validation in range(num_cross_validation_masks):
                # The escape codes rewrite the three status lines in place.
                print("\r\033[F\033[F\033[Fcross_validation %d/%d"%(fold*num_cross_validation_masks+i_cross_validation+1, num_folds*num_cross_validation_masks))
                val_indices = get_cross_validation_indices(list(range(NUM_VALIDATIONS)),
                                                       validation_index=i_cross_validation,
                                                       number_of_splits=num_cross_validation_masks,
                                                       rng_seed=fold,
                                                       )

                indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]


                #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                ind.set_value(indices)
                params.set_value(params_init)
                result, train_score = optimize_my_params()

                final_params.append(result)

                ind.set_value(val_indices)
                validation_score = f_val()
                print("              Current train value: %.6f" % train_score)
                print("         Current validation value: %.6f" % validation_score)
                final_losses.append(validation_score)

        # Average the per-fold parameter vectors and validation losses.
        optimal_params = np.mean(final_params, axis=0)
        average_loss   = np.mean(final_losses)

        expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
        filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS+NUM_FILTER_PARAMETERS]
        #print "filter param result:", filter_param_result

        return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)
Beispiel #7
0
    # Per-split patient counts read from the num_patients mapping
    # (keys 'train'/'validation'/'test', as the lookups below show).
    NUM_TRAIN_PATIENTS = num_patients['train']
    NUM_VALID_PATIENTS = num_patients['validation']
    NUM_TEST_PATIENTS = num_patients['test']
    NUM_PATIENTS = NUM_TRAIN_PATIENTS + NUM_VALID_PATIENTS + NUM_TEST_PATIENTS


##############
# Sunny data #
##############
# This small dataset is loaded into memory
_SUNNY_DATA_PATH = os.path.join(_DATA_FOLDER, "pkl_annotated", "data.pkl")

# _sunny_data is a mapping with (at least) "images" and "labels" entries.
_sunny_data = _load_file(_SUNNY_DATA_PATH)
num_sunny_images = len(_sunny_data["images"])

# Validation image indices come from the default cross-validation
# parameters; the remaining indices form the training set.
_validation_sunny_indices = validation_set.get_cross_validation_indices(
    indices=range(num_sunny_images))
_train_sunny_indices = [
    i for i in range(num_sunny_images) if i not in _validation_sunny_indices
]

# Materialise the train/validation image and label arrays via fancy indexing.
sunny_train_images = np.array(_sunny_data["images"])[_train_sunny_indices]
sunny_train_labels = np.array(_sunny_data["labels"])[_train_sunny_indices]
sunny_validation_images = np.array(
    _sunny_data["images"])[_validation_sunny_indices]
sunny_validation_labels = np.array(
    _sunny_data["labels"])[_validation_sunny_indices]

###########################
# Data form preprocessing #
###########################
Beispiel #8
0
import glob
import numpy as np
import re
from configuration import config
import cPickle as pickle
import utils
from validation_set import get_cross_validation_indices
import random

print "Loading data"


# Sort the study folders numerically by the patient id embedded in the path,
# because glob returns entries in arbitrary order.
patient_folders = sorted(glob.glob("/data/dsb15_pkl/pkl_train/*/study/"), key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))  # glob is non-deterministic!

# Patient ids run from 1 to 500 (inclusive); fold 0 serves as validation.
validation_patients_indices = get_cross_validation_indices(indices=range(1,501), validation_index=0)
train_patients_indices = [i for i in range(1,501) if i not in validation_patients_indices]

# Regex that matches any path containing a validation patient id segment.
VALIDATION_REGEX = "|".join(["(/%d/)"%i for i in validation_patients_indices])

# Folders whose path does not contain a validation id are training folders;
# the remainder form the validation set.
train_patient_folders = [folder for folder in patient_folders if re.search(VALIDATION_REGEX, folder) is None]
validation_patient_folders = [folder for folder in patient_folders if folder not in train_patient_folders]

import os
import os.path
def copy(from_folder, to_folder):
    """Recursively copy `from_folder` into `to_folder` (shell `cp -r`).

    :param from_folder: source directory path.
    :param to_folder: destination directory; the source ends up inside it.
    """
    import subprocess  # local import keeps the module import block untouched
    # Pass an argument list instead of interpolating into a shell string, so
    # paths containing spaces or shell metacharacters cannot break the copy
    # or inject commands.
    command = ["cp", "-r", from_folder, to_folder + "/."]
    print(" ".join(command))
    subprocess.call(command)

for folder in train_patient_folders:
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    Optimize softmax mixing weights over per-expert CDF predictions by
    minimizing a CRPS-style squared loss with Theano/Lasagne (Adam updates).

    :param expert_predictions: experts x validation_samples x 600 x
    :param mask_matrix: experts x validation_samples x
    :param targets: validation_samples x 600 x
    :param average_distribution: 600 x
    :param eps: numerical floor used when taking logs of the pdf
    :param num_cross_validation_masks: number of CV splits; 0 fits on all
        samples at once and returns the expanded final weights directly
    :param num_folds: number of independent CV repetitions (rng_seed=fold)
    :param cutoff: experts with previous weight <= cutoff are dropped up front
    :param do_optimization: when False, just evaluate the combined
        cumulative distribution and return its first row
    :param expert_weights: previous weights used for the cutoff filtering
    :param optimal_params: warm-start parameter vector, if available
    :param special_average: use the information-weighted geometric average
        instead of the plain masked weighted mean
    :return: (expert weights, loss, raw parameter vector) when optimizing
    """
    # Drop experts whose previous weight fell below the cutoff.
    if expert_weights is not None:
        mask_matrix = mask_matrix[expert_weights>cutoff,:]  # remove
        expert_predictions = expert_predictions[expert_weights>cutoff,:,:]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE-1)/2, (WINDOW_SIZE-1)/2, num=WINDOW_SIZE, dtype='float32'))  # targets = (NUM_VALIDATIONS, 600)

    NUM_VALIDATIONS = expert_predictions.shape[1]
    # `ind` selects which validation samples participate in the current
    # train/validation subset; it is updated via set_value() below.
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # targets = (NUM_VALIDATIONS, 600)

    # Parameter vector = per-expert logits followed by filter parameters.
    if optimal_params is None:
        params_init = np.concatenate([ np.ones((NUM_EXPERTS,), dtype='float32'),
                                       np.ones((NUM_FILTER_PARAMETERS,), dtype='float32') ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')  # expert weights = (NUM_EXPERTS,)

    C = 0.0001  # strength of the l1 term added to the training loss
    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        weights = T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0)
        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)
        # expression
        masked_weights = mask * weights
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')
        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)  # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:,:,None]*weights).astype('float32'))
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf<=0] = np.log(eps)  # floor zero-probability bins before the log
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)

        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0, 'x')

        #the different predictions, are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log,axis=-1).dimshuffle(0,'x')  # stabilizes rounding errors?

        geom_av = T.exp(geom_av_log)

        geom_pdf = geom_av/T.sum(geom_av,axis=-1).dimshuffle(0,'x')
        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        # Evaluation-only mode: use all samples, compile, and return row 0.
        ind.set_value(range(NUM_VALIDATIONS))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

    # Mean squared difference between the combined CDF and the target CDFs;
    # only the training loss carries the l1 regularization term.
    CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
    CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

    iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore", updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
    f_val = theano.function([], CRPS_valid)

    def optimize_my_params():
        # Fixed iteration budget; fewer steps in the special-average mode.
        for _ in xrange(40 if special_average else 100):  # early stopping
            score = iter_optimize()
        result = params.get_value()
        return result, score


    if num_cross_validation_masks==0:
        # No CV: fit on every sample, then expand the optimized logits back
        # to the original expert positions (dropped experts get -1e10,
        # i.e. ~zero weight after the softmax).
        ind.set_value(range(NUM_VALIDATIONS))
        params.set_value(params_init)
        optimal_params, train_score = optimize_my_params()
        final_weights = -1e10 * np.ones(expert_weights.shape,)
        final_weights[np.where(expert_weights>cutoff)] = optimal_params[:NUM_EXPERTS]
        final_params = np.concatenate(( final_weights, optimal_params[NUM_EXPERTS:]))
        return softmax(final_weights), train_score, final_params
    else:
        final_params = []
        final_losses = []
        print
        print
        print
        for fold in xrange(num_folds):
            for i_cross_validation in xrange(num_cross_validation_masks):
                # The escape codes rewrite the three status lines in place.
                print "\r\033[F\033[F\033[Fcross_validation %d/%d"%(fold*num_cross_validation_masks+i_cross_validation+1, num_folds*num_cross_validation_masks)
                val_indices = get_cross_validation_indices(range(NUM_VALIDATIONS),
                                                       validation_index=i_cross_validation,
                                                       number_of_splits=num_cross_validation_masks,
                                                       rng_seed=fold,
                                                       )

                indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]


                #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                ind.set_value(indices)
                params.set_value(params_init)
                result, train_score = optimize_my_params()

                final_params.append(result)

                ind.set_value(val_indices)
                validation_score = f_val()
                print "              Current train value: %.6f" % train_score
                print "         Current validation value: %.6f" % validation_score
                final_losses.append(validation_score)

        # Average the per-fold parameter vectors and validation losses.
        optimal_params = np.mean(final_params, axis=0)
        average_loss   = np.mean(final_losses)

        expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
        filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS+NUM_FILTER_PARAMETERS]
        #print "filter param result:", filter_param_result

        return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)