def __init__(self,
                 label_type,
                 ensemble=50,
                 scores='percentages',
                 parent=None,
                 features=None,
                 fastr_plugin='LinearExecution',
                 name='Example'):
        """
        Initialize object.

        Parameters
        ----------
        network: fastr network, default None
                If you input a network, the evaluate network is added
                to the existing network.

        """
        if parent is not None:
            self.parent = parent
            self.network = parent.network
            self.mode = 'WORC'
            self.name = parent.network.id
            self.ensemble = parent.configs[0]['Ensemble']['Use']
        else:
            self.mode = 'StandAlone'
            self.fastr_plugin = fastr_plugin
            self.name = 'WORC_Evaluate_' + name
            self.network = fastr.create_network(id=self.name)
            self.fastr_tmpdir = os.path.join(fastr.config.mounts['tmp'],
                                             self.name)
            self.ensemble = ensemble

        if features is None and self.mode == 'StandAlone':
            raise WORCexceptions.WORCIOError(
                'Either features as input or a WORC network is required for the Evaluate network.'
            )

        self.features = features

        self.label_type = label_type

        self.create_network()
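# Usage sketch: standalone construction of this evaluation object. The import
# path and the class name `Evaluate` are assumptions, and the feature file
# paths are placeholders.
#
# from WORC.tools.Evaluate import Evaluate
#
# evaluate = Evaluate(label_type='label_GTV',
#                     ensemble=50,
#                     features=['features_patient1.hdf5',
#                               'features_patient2.hdf5'],
#                     name='ExampleEvaluation')
#
# # When a parent WORC object is passed instead, the evaluation nodes are
# # added to the parent's existing fastr network:
# # evaluate = Evaluate(label_type='label_GTV', parent=worc_object)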
Example #2
def load_labels(label_file, label_type):
    """Loads the label data from a label file

    Args:
        label_file (string): The path to the label file
        label_type (list): List of the names of the labels to load

    Returns:
        dict: A dict containing 'patient_IDs', 'label' and
         'label_name'
    """
    if not os.path.exists(label_file):
        raise ae.WORCKeyError(f'File {label_file} does not exist!')

    _, extension = os.path.splitext(label_file)
    if extension == '.txt':
        label_names, patient_IDs, label_status = load_label_txt(
            label_file)
    elif extension == '.csv':
        label_names, patient_IDs, label_status = load_label_csv(
            label_file)
    elif extension == '.ini':
        label_names, patient_IDs, label_status = load_label_XNAT(
            label_file)
    else:
        raise ae.WORCIOError(extension + ' is not a valid label file extension.')

    print("Label names to extract: " + str(label_type))
    labels = list()
    for i_label in label_type:
        label_index = np.where(label_names == i_label)[0]
        if label_index.size == 0:
            raise ae.WORCValueError('Could not find label: ' + str(i_label))
        else:
            labels.append(label_status[:, label_index])

    label_data = dict()
    label_data['patient_IDs'] = patient_IDs
    label_data['label'] = labels
    label_data['label_name'] = label_type

    return label_data
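# Usage sketch: the label file path and label name below are placeholders; the
# file must be in one of the supported formats (.txt, .csv or .ini).
#
# label_data = load_labels('/path/to/pinfo.csv', label_type=['Grade'])
# print(label_data['patient_IDs'])
# print(label_data['label'][0])     # label values for 'Grade'
# print(label_data['label_name'])   # ['Grade']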
Example #3
def createfixedsplits(label_file=None,
                      label_type=None,
                      patient_IDs=None,
                      test_size=0.2,
                      N_iterations=1,
                      regression=False,
                      stratify=None,
                      modus='singlelabel',
                      output=None):
    '''
    Create fixed splits for a cross validation.
    '''
    # Check whether input is valid
    if patient_IDs is None:
        if label_file is not None and label_type is not None:
            # Read the label file
            label_data = load_labels(label_file, label_type)
            patient_IDs = label_data['patient_IDs']

            # Create the stratification object
            if modus == 'singlelabel':
                stratify = label_data['label']
            elif modus == 'multilabel':
                # Create a stratification object from the labels
                # Label = 0 means no label equals one
                # Other label numbers refer to the label name that is 1
                stratify = list()
                labels = label_data['label']
                for pnum in range(0, len(labels[0])):
                    plabel = 0
                    for lnum, slabel in enumerate(labels):
                        if slabel[pnum] == 1:
                            plabel = lnum + 1
                    stratify.append(plabel)
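                # Worked example of this encoding: for two labels and three
                # patients,
                #   labels = [[1, 0, 0],    # label 1 status per patient
                #             [0, 1, 0]]    # label 2 status per patient
                # patient 0 gets stratum 1, patient 1 gets stratum 2 and
                # patient 2 (no positive label) gets stratum 0, so
                # stratify == [1, 2, 0].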

            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))
        else:
            raise ae.WORCIOError(
                'Either a label file and label type or patient_IDs need to be provided!'
            )

    pd_dict = dict()
    for i in range(N_iterations):
        print(f'Splitting iteration {i + 1} / {N_iterations}')
        # Create a random seed for the splitting
        random_seed = np.random.randint(5000)

        # Define stratification
        unique_patient_IDs, unique_indices =\
            np.unique(np.asarray(patient_IDs), return_index=True)
        if regression:
            unique_stratify = None
        else:
            unique_stratify = [stratify[i] for i in unique_indices]

        # Split; raise an error when the dataset is too small for the split ratio
        try:
            unique_PID_train, indices_PID_test\
                = train_test_split(unique_patient_IDs,
                                   test_size=test_size,
                                   random_state=random_seed,
                                   stratify=unique_stratify)
        except ValueError as e:
            e = str(e) + ' Increase the size of your test set.'
            raise ae.WORCValueError(e)

        # Check for all IDs if they are in test or training
        indices_train = list()
        indices_test = list()
        patient_ID_train = list()
        patient_ID_test = list()
        for num, pid in enumerate(patient_IDs):
            if pid in unique_PID_train:
                indices_train.append(num)

                # Make sure we get a unique ID
                if pid in patient_ID_train:
                    n = 1
                    while str(pid + '_' + str(n)) in patient_ID_train:
                        n += 1
                    pid = str(pid + '_' + str(n))
                patient_ID_train.append(pid)
            else:
                indices_test.append(num)

                # Make sure we get a unique ID
                if pid in patient_ID_test:
                    n = 1
                    while str(pid + '_' + str(n)) in patient_ID_test:
                        n += 1
                    pid = str(pid + '_' + str(n))
                patient_ID_test.append(pid)

        # Add to train object
        pd_dict[str(i) + '_train'] = patient_ID_train

        # Test object has to be same length as training object
        extras = [""] * (len(patient_ID_train) - len(patient_ID_test))
        patient_ID_test.extend(extras)
        pd_dict[str(i) + '_test'] = patient_ID_test

    # Convert into pandas dataframe for easy use and conversion
    df = pd.DataFrame(pd_dict)

    # Write output if required
    if output is not None:
        print("Writing Output.")
        df.to_csv(output)

    return df
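# Usage sketch with explicit patient IDs (all values are placeholders):
#
# pids = ['pat001', 'pat002', 'pat003', 'pat004', 'pat005', 'pat006']
# strat = [0, 1, 0, 1, 0, 1]
# splits = createfixedsplits(patient_IDs=pids, stratify=strat,
#                            test_size=0.5, N_iterations=2,
#                            output='fixedsplits.csv')
# # The resulting columns '0_train', '0_test', '1_train', ... can be passed to
# # the cross-validation routines through their `fixedsplits` argument.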
Example #4
def findlabeldata(patientinfo, label_type, filenames,
                  image_features_temp=None):
    """
    Load the label data and match it to the image features.

    Args:
        patientinfo (string): file with patient label data
        label_type (string): name of the label read out from patientinfo
        filenames (list): names of the patient feature files, used for matching
        image_features (np.array or list): array of the features

    Returns:
        label_data (dict): contains patient ids, their labels and the label name
    """
    # Get the labels and patient IDs
    label_data_temp = load_labels(patientinfo, label_type)
    label_data = dict()
    patient_IDs = list()
    label_value = list()
    for i_len in range(len(label_data_temp['label_name'])):
        label_value.append(list())

    # Check per feature file if there is a match in the label data
    image_features = list()
    for i_feat, feat in enumerate(filenames):
        ifound = 0
        matches = list()
        for i_num, i_patient in enumerate(label_data_temp['patient_IDs']):
            if i_patient in str(feat):

                # Match: add the patient ID to the ID's and to the matches
                patient_IDs.append(i_patient)
                matches.append(i_patient)

                # If there are feature files given, add it to the list
                if image_features_temp is not None:
                    image_features.append(image_features_temp[i_feat])

                # For each label that we have, add the value to the label list
                for i_len in range(len(label_data_temp['label_name'])):
                    label_value[i_len].append(label_data_temp['label'][i_len][i_num])

                # Calculate how many matches we found for this (feature) file: should be one
                ifound += 1

        if ifound > 1:
            message = ('Multiple matches ({}) found in labeling for feature file {}.').format(str(matches), str(feat))
            raise ae.WORCIOError(message)

        elif ifound == 0:
            message = ('No entry found in labeling for feature file {}.').format(str(feat))
            raise ae.WORCIOError(message)

    # if image_features_temp is not None:
    #     image_features = np.asarray(image_features)

    # Convert to arrays
    for i_len in range(len(label_value)):
        label_value[i_len] = np.asarray(label_value[i_len])

    label_data['patient_IDs'] = np.asarray(patient_IDs)
    label_data['label'] = np.asarray(label_value)
    label_data['label_name'] = label_data_temp['label_name']

    return label_data, image_features
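# Usage sketch (file names are placeholders): each feature file name must
# contain the corresponding patient ID exactly once for the matching to
# succeed.
#
# feature_files = ['features_pat001.hdf5', 'features_pat002.hdf5']
# label_data, image_features = findlabeldata('/path/to/pinfo.csv',
#                                            ['Grade'],
#                                            feature_files,
#                                            image_features_temp=None)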
Example #5
def plot_hyperparameters(prediction,
                         label_type=None,
                         estsize=50,
                         output=None,
                         removeconstants=False,
                         verbose=False):
    """Gather which hyperparameters have been used in the best workflows.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        output of trainclassifier function, either a pandas dataframe
        or a HDF5 file

    estsize: integer, default 50
        Number of estimators that should be taken into account.

    output: filename of csv, default None
        Output file to write to. If None, no output file is written; the
        data is only returned as a variable.

    removeconstants: boolean, default False
        Determine whether to remove any hyperparameters which have the same
        value in all workflows.

    verbose: boolean, default False
        Whether to show print messages or not.

    """
    # Load the prediction file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(f'{prediction} is not an existing file!')

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    if label_type is None:
        label_type = keys[0]
    prediction = prediction[label_type]

    # Loop over classifiers
    total = len(prediction.classifiers)
    for cnum, cls in enumerate(prediction.classifiers):
        if verbose:
            print(
                f'Extracting hyperparameters for iteration {cnum + 1} / {total}.'
            )
        # Get parameters and select only a set number
        parameters = cls.cv_results_['params']
        if len(parameters) > estsize:
            parameters = parameters[0:estsize]

        # Additional information besides the parameters
        for i in range(0, estsize):
            # Add which (cross-validation) iteration is used and the rank
            parameters[i]['Iteration'] = cnum + 1
            parameters[i]['Rank'] = i + 1

            # Add some statistics
            parameters[i]['Metric'] = cls.scoring
            parameters[i]['mean_train_score'] =\
                cls.cv_results_['mean_train_score'][i]
            parameters[i]['mean_fit_time'] =\
                cls.cv_results_['mean_fit_time'][i]
            parameters[i]['std_train_score'] =\
                cls.cv_results_['std_train_score'][i]
            parameters[i]['generalization_score'] =\
                cls.cv_results_['generalization_score'][i]
            parameters[i]['rank_generalization_score'] =\
                cls.cv_results_['rank_generalization_score'][i]

            # NOTE: while this is called test score, it is the score on the
            # validation dataset(s)
            parameters[i]['mean_validation_score'] =\
                cls.cv_results_['mean_test_score'][i]
            parameters[i]['std_validation_score'] =\
                cls.cv_results_['std_test_score'][i]

        # Initialize data object if this is the first iteration
        if cnum == 0:
            data = {k: list() for k in parameters[0]}

        # Add to general data object
        for p in parameters:
            for k in p.keys():
                data[k].append(p[k])

    # Optionally, remove any hyperparameters which have the same
    # value in all workflows.
    n_parameters = len(list(data.keys()))
    if removeconstants:
        if verbose:
            print('Removing parameters with constant values.')

        keys = list(data.keys())
        for k in keys:
            # First convert all values to strings so we can use set
            tempdata = [str(i) for i in data[k]]

            # Count unique values, and if only one, delete
            n_unique = len(list(set(tempdata)))
            if n_unique == 1:
                if verbose:
                    print(f'\t Removing parameter {k}.')
                del data[k]

    # Write to csv if output name is provided
    if output is not None:
        if verbose:
            print(f'Writing output to {output}.')

        # First, specify order of columns for easy reading
        columns = list(data.keys())
        starters = [
            'Iteration', 'Rank', 'Metric', 'mean_validation_score',
            'mean_train_score', 'mean_fit_time'
        ]
        for key in starters:
            columns.remove(key)
        columns = starters + columns

        # Write to dataframe
        df = pd.DataFrame(data)
        df.to_csv(output, index=False, columns=columns)

    # Display some information
    if verbose:
        print(f'Number of hyperparameters: {n_parameters}.')
        if removeconstants:
            n_parameters_unique = len(list(data.keys()))
            print(
                f'Number of hyperparameters with unique values: {n_parameters_unique}.'
            )

    return data
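# Usage sketch (the estimator file is a placeholder for the output of the
# trainclassifier function):
#
# data = plot_hyperparameters('/path/to/estimator.hdf5',
#                             estsize=50,
#                             removeconstants=True,
#                             output='hyperparameters.csv',
#                             verbose=True)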
Example #6
def RankSVM_train_old(train_data, train_target, cost=1, lambda_tol=1e-6,
                  norm_tol=1e-4, max_iter=500, svm='Poly', gamma=0.05,
                  coefficient=0.05, degree=3):
    # NOTE: Only multilabel classification, not multiclass! Make a check.
    '''
    Weights, Bias, SVs = RankSVM_train(train_data, train_target, cost,
                                       lambda_tol, norm_tol, max_iter,
                                       svm, gamma, coefficient, degree)

    Description

        RankSVM_train takes,
            train_data   - An MxN array; the ith training instance is stored
                           in train_data[i, :].
            train_target - A QxM array; if the ith training instance belongs
                           to the jth class, then train_target[j, i] equals
                           +1, otherwise train_target[j, i] equals -1.
            svm          - The type of kernel used in training: 'RBF', 'Poly'
                           or 'Linear', with the corresponding parameters:
                           1) if svm is 'RBF', gamma gives the kernel
                              parameter, where the kernel is
                              exp(-gamma*|x[i]-x[j]|^2);
                           2) if svm is 'Poly', gamma, coefficient and degree
                              are used, where the kernel is
                              (gamma*<x[i],x[j]>+coefficient)^degree;
                           3) if svm is 'Linear', no kernel parameters are
                              used.
            cost         - The value of 'C' used in the SVM, default=1.
            lambda_tol   - The tolerance value for lambda described in the
                           appendix of [1]; default 1e-6.
            norm_tol     - The tolerance value for the difference between
                           alpha(p+1) and alpha(p) described in the appendix
                           of [1]; default 1e-4.
            max_iter     - The maximum number of iterations for RankSVM,
                           default=500.

        and returns,
            Weights      - The value for beta[k, i] as described in the
                           appendix of [1] is stored in Weights[k, i].
            Bias         - The value for b[i] as described in the appendix
                           of [1] is stored in Bias[1, i].
            SVs          - The ith support vector is stored in SVs[:, i].

    For more details, please refer to [1] and [2].
    '''

    # RankedSVM only works for multilabel problems, not multiclass, so check
    # Whether patients have no class or multiple classes
    n_class = train_target.shape[0]
    n_object = train_target.shape[1]
    for i in range(0, n_object):
        if np.sum(train_target[:, i]) != -n_class + 2:
            raise WORCexceptions.WORCIOError('RankedSVM only works ' +
                                                   'for multilabel problems,' +
                                                   ' not multiclass. One or ' +
                                                   'more objects belong ' +
                                                   'either to no class or' +
                                                   ' multiple classes. ' +
                                                   'Please check your data' +
                                                   ' again.')

    num_training, tempvalue = np.shape(train_data)

    SVs = np.zeros(shape=(tempvalue,num_training))

    num_class, tempvalue = np.shape(train_target)
    lc = np.ones(shape=(1,num_class))

    target = np.zeros(shape=(num_class, tempvalue))
    for i in range(num_training):
        temp = train_target[:,int(i)]
        if np.logical_and(np.sum(temp) != num_class, np.sum(temp) != -num_class):
              #SVs =  (SVs, train_data[int(i),:].conj().T)
            SVs [:,i] =  train_data[int(i),:].conj().T
            target[:,i] = temp


    Dim, num_training = np.shape(SVs)
    Label = np.array(np.zeros(shape=(num_training,1)), dtype=float)
    not_Label = []
    Label_size = np.zeros(shape=(1,num_training))
    size_alpha = np.zeros(shape=(1,num_training), dtype=float)

    for i in range(num_training):
        temp1 = train_target[:,int(i)]
        Label_size[0,int(i)] = np.sum(temp1 == lc)
        lds = num_class-Label_size[0,int(i)]
        size_alpha[0,int(i)] = np.dot(lds, Label_size[0,int(i)])
        for j in range(num_class):
            if temp1[int(j)] == 1:
                 Label[int(i),0] = np.array([j])
            else:
                 not_Label.append((j))

    not_Label = np.reshape(not_Label, (num_training,num_class-1))

    kernel = np.zeros(shape =(num_training, num_training), dtype=float)

    if svm == 'RBF':
        for i in range(num_training):
            for j in range(num_training):
                kernel[int(i),int(j)] = np.exp(-gamma*(np.sum((SVs[:,i]-SVs[:,j])**2)))


    else:
        if svm == 'Poly':
            for i in range(num_training):
                for j in range(num_training):
                    ab= np.dot((np.array([SVs[:,int(j)]])),((np.array([SVs[:,int(i)]])).conj().T))
                    ab=gamma*ab
                    ab=ab+coefficient
                    ab=ab**degree
            #kernel[int(i),int(j)] = (gamma*(SVs[:,int(i)].conj().T)*SVs[:,int(j)]+coefficient)**degree
                    kernel[int(i),int(j)] = np.array([ab])
        else:
                for i in range(num_training):
                    for j in range(num_training):
                        kernel[int(i),int(j)] = np.dot((np.array([SVs[:,int(j)]])),((np.array([SVs[:,int(i)]])).conj().T))

    svm_used = svm

    #Begin training phase

    #data initializing

    ak = np.sum(size_alpha, dtype=int)
    Alpha = np.zeros(shape=(1, ak))

    # Create a cell-like array c_value (one matrix per class)

    c_value = np.zeros((num_class,), dtype=object)

    for i in range(num_class):
        c_value[i] = np.zeros(shape=(num_class,num_class))

    for i in range(num_class):
        ak = c_value[i]
        ak[i,:]= np.ones(shape=(1,num_class))
        ak[:,i]= -np.ones(shape=(num_class,))
        c_value[i] = ak

    # print(Label_size)
    # Find the Alpha value using the Frank-Wolfe method [1]

    continuing = True
    iteration = 0

    while continuing:

        # Compute Beta
        Beta = np.zeros(shape=(num_class,num_training))
        for k in range(num_class):
            for i in range(num_training):
                for m in range(Label_size[:,int(i)]):
                    for n in range(num_class-Label_size[:,int(i)]):
                    #index = np.sum(size_alpha[:,0:i])+(m-1)*(num_class-Label_size[i])+n
                        index = np.sum(size_alpha[:,0:i])+n

                        ak = np.array(c_value[k], dtype=int)
                        # This supports only multiclass; for multilabel use
                        # r1 = Label[i]; r1 = r1[m] instead.
                        r1 = Label[int(i)]
                        c1 = not_Label[int(i)]
                        c1 = c1[n]
                        Beta[k,i] = Beta[k,i]+ak[int(r1),int(c1)]*Alpha[:,int(index)]

    ####computing gradient(ikl)

        inner = np.zeros(shape=(num_class,num_training))
        for k in range(num_class):
            for j in range(num_training):
                inner[k,j] = np.dot(Beta[k,:], kernel[:,j])

        gradient=[]

        for i in range(num_training):
            for m in range(Label_size[:,int(i)]):
                for n in range(num_class-Label_size[:,int(i)]):
                    # This supports only multiclass; for multilabel use
                    # r1 = Label[i]; r1 = r1[m] instead.
                    r1 = Label[int(i)]
                    c1 = not_Label[int(i)]
                    c1 = c1[n]
                    temp = inner[int(r1), int(i)]-inner[int(c1),int(i)]-1
            #gradient=np.array([gradient,temp])
                    gradient.append(float(temp))

        gradient = np.array(gradient, dtype=float)
        gradient = gradient.conj().T


    ###Find Alpha_new
        Aeq = np.zeros(shape=(num_class,np.sum(size_alpha, dtype=int)))
        for k in range(num_class):
            counter=0
            for i in range(num_training):
                for m in range (Label_size[:,int(i)]):
                    for n in range(num_class-Label_size[:,int(i)]):
                #counter+=1
                        # This supports only multiclass; for multilabel use
                        # r1 = Label[i]; r1 = r1[m] instead.
                        r1 = Label[i]
                        c1 = not_Label[int(i)]
                        c1 = c1[n]
                        ak = c_value[k]
                        Aeq[k,counter] = ak[int(r1),int(c1)]
                        counter+=1
    #print Aeq
        beq=np.zeros(shape=(num_class,))
        LB=np.zeros(shape=(np.sum(size_alpha, dtype=int),1))
        UB=np.zeros(shape=(np.sum(size_alpha, dtype=int),1))
        counter=0
        for i in range(num_training):
            for m in range(Label_size[:,int(i)]):
                for n in range(num_class-Label_size[:,int(i)]):
            #counter+=1
                    UB[counter,:]=cost/(size_alpha[:,i])
                    counter+=1
    #print UB
        cc = [LB.T, UB.T]
        cc =np.ravel(cc)
        bounds = np.reshape(cc, (2,np.sum(size_alpha, dtype=int)))
        bounds = bounds.T
        Alpha_new=linprog(gradient.conj().T,A_ub=None, b_ub=None, A_eq=Aeq, b_eq=beq.T,bounds=bounds)
        Alpha_new = Alpha_new.x
        Alpha_new = (np.array(Alpha_new)).conj().T

        Lambda =fminbound(neg_dual_func, 0.0, 1.0,args=  (Alpha,Alpha_new,c_value,kernel,num_training,num_class,Label,not_Label,Label_size,size_alpha))


    #print Lambda
    #Test convergence

        if np.logical_or(np.abs(Lambda)<=lambda_tol, np.dot(Lambda, np.sqrt(np.sum(((Alpha_new-Alpha)**2.))))<=norm_tol):
            continuing = False
            # np.disp('program terminated normally')
        else:
            if iteration >= max_iter:
                continuing = False

            else:
                Alpha = Alpha+np.dot(Lambda, Alpha_new-Alpha)

                iteration+=1


    Weights = Beta

    #Computing Bias

    Left = []
    Right = []
    for i in  range(num_training):
        for m in range(Label_size[:,int(i)]):
            for n in range(num_class-Label_size[:,int(i)]):
                index = np.sum(size_alpha[:,0:i])+n
                if np.logical_and(np.abs(Alpha[:,int(index)]) >= lambda_tol, np.abs(Alpha[:,int(index)]-cost/(size_alpha[:,i])) >= lambda_tol):
                    vector = np.zeros(shape=(1, num_class))
                    vector[0,int(Label[i])] = 1
                    c1 = not_Label[int(i)]
                    c1 = c1[n]
                    vector[0,int(c1)] = -1.
                    Left.append(vector)
                    Right.append(-gradient[int(index)])


    if is_empty(Left):
        Bias = np.sum(train_target.conj().T)
    else:
        bb = np.array([Right])
        ss1,ss2 = bb.shape
        aa = np.ravel(Left)
        aa = np.reshape(aa,(ss2,num_class))

        ##### Proper way to solve linear equation with non-square matrix
        Bias = np.linalg.lstsq(aa,bb.T,rcond = -1)[0]
        #Bias = Bias.T

    return Weights, Bias, SVs
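# Input sketch for RankSVM_train_old (random placeholder data, shapes only):
# train_data is MxN (instances x features) and train_target is QxM with
# exactly one +1 per column and -1 elsewhere, as checked at the start of the
# function.
#
# import numpy as np
# M, N, Q = 20, 5, 3
# train_data = np.random.rand(M, N)
# train_target = -np.ones((Q, M))
# train_target[np.random.randint(0, Q, size=M), np.arange(M)] = 1
# Weights, Bias, SVs = RankSVM_train_old(train_data, train_target,
#                                        svm='Linear', max_iter=10)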
Example #7
def random_split_cross_validation(image_features,
                                  feature_labels,
                                  classes,
                                  patient_ids,
                                  n_iterations,
                                  param_grid,
                                  config,
                                  modus,
                                  test_size,
                                  start=0,
                                  save_data=None,
                                  tempsave=False,
                                  tempfolder=None,
                                  fixedsplits=None,
                                  fixed_seed=False,
                                  use_fastr=None,
                                  fastr_plugin=None):
    """Cross-validation in which data is randomly split in each iteration.

    Due to options of doing single-label and multi-label classification,
    stratified splitting, and regression, we use a manual loop instead
    of the default scikit-learn object.

    Parameters
    ------------
    Most parameters mirror those of the crossval function below; in addition,
    save_data can be passed to resume from previously completed iterations,
    and start gives the iteration to resume from.

    Returns
    ------------
    save_data: list
        For each iteration, a tuple of (trained_classifier, X_train, X_test,
        Y_train, Y_test, patient_ID_train, patient_ID_test, random_seed).

    """
    print('Starting random-split cross-validation.')
    logging.debug('Starting random-split cross-validation.')
    if save_data is None:
        # Start from zero, thus an empty list of previous data
        save_data = list()

    for i in range(start, n_iterations):
        print(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_iterations)))
        logging.debug(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_iterations)))
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print(f'\t Time: {timestamp}.')
        logging.debug(f'\t Time: {timestamp}.')
        if fixed_seed:
            random_seed = i**2
        else:
            random_seed = np.random.randint(5000)

        t = time.time()

        # Split into test and training set, where the percentage of each
        # label is maintained
        if any(clf in regressors for clf in param_grid['classifiers']):
            # We cannot do a stratified shuffle split with regression
            stratify = None
        else:
            if modus == 'singlelabel':
                classes_temp = stratify = classes.ravel()
            elif modus == 'multilabel':
                # Create a stratification object from the labels
                # Label = 0 means no label equals one
                # Other label numbers refer to the label name that is 1
                stratify = list()
                for pnum in range(0, len(classes[0])):
                    plabel = 0
                    for lnum, slabel in enumerate(classes):
                        if slabel[pnum] == 1:
                            plabel = lnum + 1
                    stratify.append(plabel)

                # Sklearn multiclass requires rows to be objects/patients
                classes_temp = np.zeros((classes.shape[1], classes.shape[0]))
                for n_patient in range(0, classes.shape[1]):
                    for n_label in range(0, classes.shape[0]):
                        classes_temp[n_patient, n_label] = classes[n_label,
                                                                   n_patient]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        if fixedsplits is None:
            # Use Random Split. Split per patient, not per sample
            unique_patient_ids, unique_indices =\
                np.unique(np.asarray(patient_ids), return_index=True)
            if any(clf in regressors for clf in param_grid['classifiers']):
                unique_stratify = None
            else:
                unique_stratify = [stratify[i] for i in unique_indices]

            try:
                unique_PID_train, indices_PID_test\
                    = train_test_split(unique_patient_ids,
                                       test_size=test_size,
                                       random_state=random_seed,
                                       stratify=unique_stratify)
            except ValueError as e:
                e = str(e) + ' Increase the size of your validation set.'
                raise ae.WORCValueError(e)

            # Check for all ids if they are in test or training
            indices_train = list()
            indices_test = list()
            patient_ID_train = list()
            patient_ID_test = list()
            for num, pid in enumerate(patient_ids):
                if pid in unique_PID_train:
                    indices_train.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_train:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_train:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_train.append(pid)
                else:
                    indices_test.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_test:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_test:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_test.append(pid)

            # Split features and labels accordingly
            X_train = [image_features[i] for i in indices_train]
            X_test = [image_features[i] for i in indices_test]
            if modus == 'singlelabel':
                Y_train = classes_temp[indices_train]
                Y_test = classes_temp[indices_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[indices_train, :]
                Y_test = classes_temp[indices_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        else:
            # Use predefined splits
            train = fixedsplits[str(i) + '_train'].values
            test = fixedsplits[str(i) + '_test'].values

            # Convert the numbers to the correct indices
            ind_train = list()
            for j in train:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_train.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) +
                                         " is not included!")

            ind_test = list()
            for j in test:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_test.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) +
                                         " is not included!")

            X_train = [image_features[i] for i in ind_train]
            X_test = [image_features[i] for i in ind_test]

            patient_ID_train = patient_ids[ind_train]
            patient_ID_test = patient_ids[ind_test]

            if modus == 'singlelabel':
                Y_train = classes_temp[ind_train]
                Y_test = classes_temp[ind_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[ind_train, :]
                Y_test = classes_temp[ind_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        trained_classifier = random_search_parameters(
            features=X_train,
            labels=Y_train,
            param_grid=param_grid,
            n_cores=n_cores,
            random_seed=random_seed,
            **config['HyperOptimization'])

        # We only want to save the feature values and one label array
        X_train = [x[0] for x in X_train]
        X_test = [x[0] for x in X_test]

        temp_save_data = (trained_classifier, X_train, X_test, Y_train, Y_test,
                          patient_ID_train, patient_ID_test, random_seed)

        save_data.append(temp_save_data)

        # Create a temporary save
        if tempsave:
            panda_labels = [
                'trained_classifier', 'X_train', 'X_test', 'Y_train', 'Y_test',
                'config', 'patient_ID_train', 'patient_ID_test', 'random_seed',
                'feature_labels'
            ]

            panda_data_temp =\
                pd.Series([trained_classifier, X_train, X_test, Y_train,
                           Y_test, config, patient_ID_train,
                           patient_ID_test, random_seed, feature_labels],
                          index=panda_labels,
                          name='Constructed crossvalidation')

            panda_data = pd.DataFrame(panda_data_temp)
            n = 0
            filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.hdf5')
            while os.path.exists(filename):
                n += 1
                filename = os.path.join(tempfolder,
                                        'tempsave_' + str(i + n) + '.hdf5')

            panda_data.to_hdf(filename, 'EstimatorData')
            del panda_data, panda_data_temp

        # Print elapsed time
        elapsed = int((time.time() - t) / 60.0)
        print(f'\t Fitting took {elapsed} minutes.')
        logging.debug(f'\t Fitting took {elapsed} minutes.')

    return save_data
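# Call sketch (config, param_grid and the data objects follow the WORC
# conventions described in the crossval docstring below; all values are
# placeholders):
#
# save_data = random_split_cross_validation(
#     image_features=image_features,    # list of (values, labels) tuples
#     feature_labels=image_features[0][1],
#     classes=label_data['label'][0].ravel(),
#     patient_ids=label_data['patient_IDs'],
#     n_iterations=10,
#     param_grid=param_grid,
#     config=config,
#     modus='singlelabel',
#     test_size=0.2)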
Example #8
def crossval(config, label_data, image_features,
             param_grid=None, use_fastr=False,
             fastr_plugin=None, tempsave=False,
             fixedsplits=None, ensemble={'Use': False}, outputfolder=None,
             modus='singlelabel'):
    """
    Constructs multiple individual classifiers based on the label settings

    Parameters
    ----------
    config: dict, mandatory
            Dictionary with config settings. See the Github Wiki for the
            available fields and formatting.

    label_data: dict, mandatory
            Should contain the following:
            patient_IDs (list): IDs of the patients, used to keep track of test and
                     training sets, and label data
            label (list): List of lists, where each list contains the
                                   label status for that patient for each
                                   label
            label_name (list): Contains the different names that are stored
                                  in the label object

    image_features: numpy array, mandatory
            Consists of a tuple of two lists for each patient:
            (feature_values, feature_labels)

    param_grid: dictionary, optional
            Contains the parameters and their values which are used in the
            grid or randomized search hyperparameter optimization. See the
            construct_classifier function for some examples.

    use_fastr: boolean, default False
            If False, parallel execution through Joblib is used for fast
            execution of the hyperparameter optimization. Especially suited
            for execution on multicore (H)PCs. The settings used are
            specified in the config.ini file in the IOparser folder, which you
            can adjust to your system.

            If True, fastr is used to split the hyperparameter optimization in
            separate jobs. Parameters for the splitting can be specified in the
            config file. Especially suited for clusters.

    fastr_plugin: string, default None
            Determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.

    tempsave: boolean, default False
            If True, create a .hdf5 file after each cross-validation iteration
            containing the classifier and results from that split. This is
            written to the GSOut folder in your fastr output mount. The
            combined results of all cross-validation iterations are always
            saved to a single .hdf5 file, regardless of this setting.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    ensemble: dictionary, optional
            Contains the configuration for constructing an ensemble.

    modus: string, default 'singlelabel'
            Determine whether one-vs-all classification (or regression) for
            each single label is used ('singlelabel') or if multilabel
            classification is performed ('multilabel').

    Returns
    ----------
    panda_data: pandas dataframe
            Contains all information on the trained classifier.

    """
    if tempsave:
        import fastr


    # Define all possible regressors
    regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']

    # Process input data
    patient_IDs = label_data['patient_IDs']
    label_value = label_data['label']
    label_name = label_data['label_name']

    if outputfolder is None:
        logfilename = os.path.join(os.getcwd(), 'classifier.log')
    else:
        logfilename = os.path.join(outputfolder, 'classifier.log')
    print("Logging to file " + str(logfilename))

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    logging.basicConfig(filename=logfilename, level=logging.DEBUG)
    N_iterations = config['CrossValidation']['N_iterations']
    test_size = config['CrossValidation']['test_size']

    classifier_labelss = dict()
    logging.debug('Starting classifier')

    # We only need one set of feature labels, assuming they are all the same
    feature_labels = image_features[0][1]

    # Check if we need to use fixedsplits:
    if fixedsplits is not None and '.xlsx' in fixedsplits:
        # fixedsplits = '/home/mstarmans/Settings/RandomSufflingOfData.xlsx'
        wb = xlrd.open_workbook(fixedsplits)
        wb = wb.sheet_by_index(1)

    if modus == 'singlelabel':
        print('Performing Single class classification.')
        logging.debug('Performing Single class classification.')
    elif modus == 'multilabel':
        print('Performing Multi label classification.')
        logging.debug('Performing Multi label classification.')
        label_value = [label_value]
        label_name = [label_name]
    else:
        m = ('{} is not a valid modus!').format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    for i_class, i_name in zip(label_value, label_name):
        if modus == 'singlelabel':
            i_class_temp = i_class.ravel()

        save_data = list()

        for i in range(0, N_iterations):
            print(('Cross validation iteration {} / {} .').format(str(i + 1), str(N_iterations)))
            logging.debug(('Cross validation iteration {} / {} .').format(str(i + 1), str(N_iterations)))
            random_seed = np.random.randint(5000)

            # Split into test and training set, where the percentage of each
            # label is maintained
            if any(clf in regressors for clf in param_grid['classifiers']):
                # We cannot do a stratified shuffle split with regression
                stratify = None
            else:
                if modus == 'singlelabel':
                    stratify = i_class_temp
                elif modus == 'multilabel':
                    # Create a stratification object from the labels
                    # Label = 0 means no label equals one
                    # Other label numbers refer to the label name that is 1
                    stratify = list()
                    for pnum in range(0, len(i_class[0])):
                        plabel = 0
                        for lnum, slabel in enumerate(i_class):
                            if slabel[pnum] == 1:
                                plabel = lnum + 1
                        stratify.append(plabel)

                    # Sklearn multiclass requires rows to be objects/patients
                    # i_class = i_class.reshape(i_class.shape[1], i_class.shape[0])
                    i_class_temp = np.zeros((i_class.shape[1], i_class.shape[0]))
                    for n_patient in range(0, i_class.shape[1]):
                        for n_label in range(0, i_class.shape[0]):
                            i_class_temp[n_patient, n_label] = i_class[n_label, n_patient]
                else:
                    raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            if fixedsplits is None:
                # Use Random Split. Split per patient, not per sample
                unique_patient_IDs, unique_indices =\
                    np.unique(np.asarray(patient_IDs), return_index=True)
                if any(clf in regressors for clf in param_grid['classifiers']):
                    unique_stratify = None
                else:
                    unique_stratify = [stratify[i] for i in unique_indices]

                try:
                    unique_PID_train, indices_PID_test\
                        = train_test_split(unique_patient_IDs,
                                           test_size=test_size,
                                           random_state=random_seed,
                                           stratify=unique_stratify)
                except ValueError as e:
                    e = str(e) + ' Increase the size of your validation set.'
                    raise ae.WORCValueError(e)

                # Check for all IDs if they are in test or training
                indices_train = list()
                indices_test = list()
                patient_ID_train = list()
                patient_ID_test = list()
                for num, pid in enumerate(patient_IDs):
                    if pid in unique_PID_train:
                        indices_train.append(num)

                        # Make sure we get a unique ID
                        if pid in patient_ID_train:
                            n = 1
                            while str(pid + '_' + str(n)) in patient_ID_train:
                                n += 1
                            pid = str(pid + '_' + str(n))
                        patient_ID_train.append(pid)
                    else:
                        indices_test.append(num)

                        # Make sure we get a unique ID
                        if pid in patient_ID_test:
                            n = 1
                            while str(pid + '_' + str(n)) in patient_ID_test:
                                n += 1
                            pid = str(pid + '_' + str(n))
                        patient_ID_test.append(pid)

                # Split features and labels accordingly
                X_train = [image_features[i] for i in indices_train]
                X_test = [image_features[i] for i in indices_test]
                if modus == 'singlelabel':
                    Y_train = i_class_temp[indices_train]
                    Y_test = i_class_temp[indices_test]
                elif modus == 'multilabel':
                    Y_train = i_class_temp[indices_train, :]
                    Y_test = i_class_temp[indices_test, :]
                else:
                    raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            else:
                # Use predefined splits
                indices = wb.col_values(i)
                indices = [int(j) for j in indices[1:]]  # First element is "Iteration x"
                train = indices[0:121]
                test = indices[121:]

                # Convert the numbers to the correct indices
                ind_train = list()
                for j in train:
                    success = False
                    for num, p in enumerate(patient_IDs):
                        if str(j).zfill(3) == p[0:3]:
                            ind_train.append(num)
                            success = True
                    if not success:
                        raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

                ind_test = list()
                for j in test:
                    success = False
                    for num, p in enumerate(patient_IDs):
                        if str(j).zfill(3) == p[0:3]:
                            ind_test.append(num)
                            success = True
                    if not success:
                        raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

                X_train = np.asarray(image_features)[ind_train].tolist()
                Y_train = np.asarray(i_class_temp)[ind_train].tolist()
                patient_ID_train = patient_IDs[ind_train]
                X_test = np.asarray(image_features)[ind_test].tolist()
                Y_test = np.asarray(i_class_temp)[ind_test].tolist()
                patient_ID_test = patient_IDs[ind_test]

            # Find best hyperparameters and construct classifier
            config['HyperOptimization']['use_fastr'] = use_fastr
            config['HyperOptimization']['fastr_plugin'] = fastr_plugin
            n_cores = config['General']['Joblib_ncores']
            trained_classifier = random_search_parameters(features=X_train,
                                                             labels=Y_train,
                                                             param_grid=param_grid,
                                                             n_cores=n_cores,
                                                             **config['HyperOptimization'])

            # Create an ensemble if required
            if ensemble['Use']:
                trained_classifier.create_ensemble(X_train, Y_train)

            # We only want to save the feature values and one label array
            X_train = [x[0] for x in X_train]
            X_test = [x[0] for x in X_test]

            temp_save_data = (trained_classifier, X_train, X_test, Y_train,
                              Y_test, patient_ID_train, patient_ID_test, random_seed)

            save_data.append(temp_save_data)

            # Create a temporary save
            if tempsave:
                panda_labels = ['trained_classifier', 'X_train', 'X_test', 'Y_train', 'Y_test',
                                'config', 'patient_ID_train', 'patient_ID_test',
                                'random_seed']

                panda_data_temp =\
                    pd.Series([trained_classifier, X_train, X_test, Y_train,
                               Y_test, config, patient_ID_train,
                               patient_ID_test, random_seed],
                              index=panda_labels,
                              name='Constructed crossvalidation')

                panda_data = pd.DataFrame(panda_data_temp)
                n = 0
                filename = os.path.join(fastr.config.mounts['tmp'], 'GSout', 'RS_' + str(i) + '.hdf5')
                while os.path.exists(filename):
                    n += 1
                    filename = os.path.join(fastr.config.mounts['tmp'], 'GSout', 'RS_' + str(i + n) + '.hdf5')

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))

                panda_data.to_hdf(filename, 'SVMdata')
                del panda_data, panda_data_temp

        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test',
                        'config', 'patient_ID_train', 'patient_ID_test',
                        'random_seed', 'feature_labels']

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        if modus == 'singlelabel':
            i_name = ''.join(i_name)
        elif modus == 'multilabel':
            i_name = ','.join(i_name)

        classifier_labelss[i_name] = panda_data_temp

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
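# Call sketch (config, label_data and image_features follow the formats
# described in the docstring above; the output folder is a placeholder):
#
# panda_data = crossval(config, label_data, image_features,
#                       param_grid=param_grid,
#                       use_fastr=False,
#                       modus='singlelabel',
#                       outputfolder='/path/to/output')
# panda_data.to_hdf('classification.hdf5', 'SVMdata')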
Example #9
def plot_SVM(prediction, label_data, label_type, show_plots=False,
             alpha=0.95, ensemble=False, verbose=True,
             ensemble_scoring=None, output='stats',
             modus='singlelabel'):
    '''
    Plot the output of a single binary estimator, e.g. a SVM.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        output of trainclassifier function, either a pandas dataframe
        or a HDF5 file

    label_data: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    label_type: string, mandatory
        Name of the label to extract from the label data to test the
        estimator on.

    show_plots: Boolean, default False
        Determine whether matplotlib performance plots are made.

    alpha: float, default 0.95
        Significance of confidence intervals.

    ensemble: False, integer or 'Caruana'
        Determine whether an ensemble will be created. If so,
        either provide an integer to determine how many of the
        top performing classifiers should be in the ensemble, or use
        the string "Caruana" to use smart ensembling based on
        Caruana et al. 2004.

    verbose: boolean, default True
        Print intermediate messages.

    ensemble_scoring: string, default None
        Metric to be used for evaluating the ensemble. If None,
        the option set in the prediction object will be used.

    output: string, default stats
        Determine which results are put out. If stats, the statistics of the
        estimator will be returned. If scores, the scores will be returned.

    Returns
    ----------
    Depending on the output parameters, the following outputs are returned:

    If output == 'stats':
    stats: dictionary
        Contains the confidence intervals of the performance metrics
        and the number of times each patient was classified correctly
        or incorrectly.

    If output == 'scores':
    y_truths: list
        Contains the true label for each object.

    y_scores: list
        Contains the score (e.g. posterior) for each object.

    y_predictions: list
        Contains the predicted label for each object.

    PIDs: list
        Contains the patient ID/name for each object.
    '''

    # Load the prediction object if it's a hdf5 file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(('{} is not an existing file!').format(str(prediction)))

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    SVMs = list()
    if label_type is None:
        label_type = keys[0]

    # Load the label data
    if type(label_data) is not dict:
        if os.path.isfile(label_data):
            if type(label_type) is not list:
                # Singlelabel: convert to list
                label_type = [[label_type]]
            label_data = lp.load_labels(label_data, label_type)

    patient_IDs = label_data['patient_IDs']
    labels = label_data['label']

    if type(label_type) is list:
        # FIXME: Multiple label types are not supported yet.
        print('[WORC Warning] Multiple label types are not supported yet. Taking the first label for plot_SVM.')
        label_type = keys[0]

    # Extract the estimators, features and labels
    SVMs = prediction[label_type]['classifiers']
    regression = is_regressor(SVMs[0].best_estimator_)
    Y_test = prediction[label_type]['Y_test']
    X_test = prediction[label_type]['X_test']
    X_train = prediction[label_type]['X_train']
    Y_train = prediction[label_type]['Y_train']
    feature_labels = prediction[label_type]['feature_labels']

    # Create lists for performance measures
    sensitivity = list()
    specificity = list()
    precision = list()
    accuracy = list()
    auc = list()
    f1_score_list = list()
    patient_classification_list = dict()
    if output in ['scores', 'decision']:
        # Keep track of all ground truths and scores
        y_truths = list()
        y_scores = list()
        y_predictions = list()
        PIDs = list()

    # Loop over the test sets, which probably correspond with cross validation
    # iterations
    for i in range(0, len(Y_test)):
        print("\n")
        print(("Cross validation {} / {}.").format(str(i + 1), str(len(Y_test))))
        test_patient_IDs = prediction[label_type]['patient_ID_test'][i]
        train_patient_IDs = prediction[label_type]['patient_ID_train'][i]
        X_test_temp = X_test[i]
        X_train_temp = X_train[i]
        Y_train_temp = Y_train[i]
        Y_test_temp = Y_test[i]
        test_indices = list()

        # Check which patients are in the test set.
        for i_ID in test_patient_IDs:
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

            # Initiate counting how many times a patient is classified correctly
            if i_ID not in patient_classification_list:
                patient_classification_list[i_ID] = dict()
                patient_classification_list[i_ID]['N_test'] = 0
                patient_classification_list[i_ID]['N_correct'] = 0
                patient_classification_list[i_ID]['N_wrong'] = 0

            patient_classification_list[i_ID]['N_test'] += 1

        # Extract ground truth
        y_truth = Y_test_temp

        # If requested, first let the SearchCV object create an ensemble
        if ensemble:
            # NOTE: Added for backwards compatibility
            if not hasattr(SVMs[i], 'cv_iter'):
                cv_iter = list(SVMs[i].cv.split(X_train_temp, Y_train_temp))
                SVMs[i].cv_iter = cv_iter

            # Create the ensemble
            X_train_temp = [(x, feature_labels) for x in X_train_temp]
            SVMs[i].create_ensemble(X_train_temp, Y_train_temp,
                                    method=ensemble, verbose=verbose,
                                    scoring=ensemble_scoring)

        # Create prediction
        y_prediction = SVMs[i].predict(X_test_temp)

        if regression:
            y_score = y_prediction
        else:
            y_score = SVMs[i].predict_proba(X_test_temp)[:, 1]

        print("Truth: " + str(y_truth))
        print("Prediction: " + str(y_prediction))

        # Record whether each patient was classified correctly
        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction, test_patient_IDs):
            if modus == 'multilabel':
                success = (i_truth == i_predict).all()
            else:
                success = i_truth == i_predict

            if success:
                patient_classification_list[i_test_ID]['N_correct'] += 1
            else:
                patient_classification_list[i_test_ID]['N_wrong'] += 1

        if output == 'decision':
            # Output the posteriors
            y_scores.append(y_score)
            y_truths.append(y_truth)
            y_predictions.append(y_prediction)
            PIDs.append(test_patient_IDs)

        elif output == 'scores':
            # Output the posteriors
            y_scores.append(y_score)
            y_truths.append(y_truth)
            y_predictions.append(y_prediction)
            PIDs.append(test_patient_IDs)

        elif output == 'stats':
            # Compute statistics
            # Compute confusion matrix and use for sensitivity/specificity
            if modus == 'singlelabel':
                # Compute singlelabel performance metrics
                if not regression:
                    accuracy_temp, sensitivity_temp, specificity_temp,\
                        precision_temp, f1_score_temp, auc_temp =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)
                else:
                    r2score, MSE, coefICC, PearsonC, PearsonP, SpearmanC,\
                        SpearmanP =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)

            elif modus == 'multilabel':
                # Convert one-hot encoded labels to a single label per patient
                y_truth_temp = list()
                y_prediction_temp = list()
                for yt, yp in zip(y_truth, y_prediction):
                    label = np.where(yt == 1)
                    if len(label) > 1:
                        raise ae.WORCNotImplementedError('Multiclass classification evaluation is not supported in WORC.')

                    y_truth_temp.append(label[0][0])
                    label = np.where(yp == 1)
                    y_prediction_temp.append(label[0][0])

                y_truth = y_truth_temp
                y_prediction = y_prediction_temp

                # Compute multilabel performance metrics
                accuracy_temp, sensitivity_temp, specificity_temp,\
                    precision_temp, f1_score_temp, auc_temp =\
                    metrics.performance_multilabel(y_truth,
                                                   y_prediction,
                                                   y_score)

            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            # Print AUC to keep you up to date and append the performance
            # to the lists for all cross validations
            # NOTE: the regression metrics computed above are not aggregated
            # by this older plotting function.
            if not regression:
                print('AUC: ' + str(auc_temp))

                accuracy.append(accuracy_temp)
                sensitivity.append(sensitivity_temp)
                specificity.append(specificity_temp)
                auc.append(auc_temp)
                f1_score_list.append(f1_score_temp)
                precision.append(precision_temp)

    if output in ['scores', 'decision']:
        # Return the scores and true values of all patients
        return y_truths, y_scores, y_predictions, PIDs
    elif output == 'stats':
        # Compute statistics
        # Extract sample size
        N_1 = float(len(train_patient_IDs))
        N_2 = float(len(test_patient_IDs))

        # Compute alpha confidence intervals
        stats = dict()
        stats["Accuracy 95%:"] = str(compute_CI.compute_confidence(accuracy, N_1, N_2, alpha))
        stats["AUC 95%:"] = str(compute_CI.compute_confidence(auc, N_1, N_2, alpha))
        stats["F1-score 95%:"] = str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, alpha))
        stats["Precision 95%:"] = str(compute_CI.compute_confidence(precision, N_1, N_2, alpha))
        stats["Sensitivity 95%:"] = str(compute_CI.compute_confidence(sensitivity, N_1, N_2, alpha))
        stats["Specificity 95%:"] = str(compute_CI.compute_confidence(specificity, N_1, N_2, alpha))

        # Print the confidence intervals to keep you up to date
        for k, v in stats.items():
            print(k + ' ' + v)

        # Extract statistics on how often patients got classified correctly
        alwaysright = dict()
        alwayswrong = dict()
        percentages = dict()
        for i_ID in patient_classification_list:
            percentage_right = patient_classification_list[i_ID]['N_correct'] / float(patient_classification_list[i_ID]['N_test'])

            if i_ID in patient_IDs:
                label = labels[0][np.where(i_ID == patient_IDs)]
            else:
                # Multiple instances of one patient
                label = labels[0][np.where(i_ID.split('_')[0] == patient_IDs)]

            label = label[0][0]
            percentages[i_ID] = str(label) + ': ' + str(round(percentage_right * 100, 2)) + '%'
            if percentage_right == 1.0:
                alwaysright[i_ID] = label
                print(("Always Right: {}, label {}").format(i_ID, label))

            elif percentage_right == 0:
                alwayswrong[i_ID] = label
                print(("Always Wrong: {}, label {}").format(i_ID, label))

        stats["Always right"] = alwaysright
        stats["Always wrong"] = alwayswrong
        stats['Percentages'] = percentages

        if show_plots:
            # Plot the performance metrics in boxplots
            import matplotlib.pyplot as plt

            metrics_to_plot = [(accuracy, 'Accuracy'), (auc, 'AUC'),
                               (precision, 'Precision'),
                               (sensitivity, 'Sensitivity'),
                               (specificity, 'Specificity')]
            for values, name in metrics_to_plot:
                plt.figure()
                plt.boxplot(values)
                plt.ylim([-0.05, 1.05])
                plt.ylabel(name)
                plt.tick_params(
                    axis='x',           # changes apply to the x-axis
                    which='both',       # both major and minor ticks are affected
                    bottom=False,       # ticks along the bottom edge are off
                    top=False,          # ticks along the top edge are off
                    labelbottom=False)  # labels along the bottom edge are off
                plt.tight_layout()
                plt.show()

        return stats


def plot_estimator_performance(prediction,
                               label_data,
                               label_type,
                               crossval_type=None,
                               alpha=0.95,
                               ensemble=None,
                               verbose=True,
                               ensemble_scoring=None,
                               output=None,
                               modus=None,
                               thresholds=None,
                               survival=False,
                               shuffle_estimators=False,
                               bootstrap=None,
                               bootstrap_N=None,
                               overfit_scaler=None):
    """Plot the output of a single estimator, e.g. a SVM.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        output of trainclassifier function, either a pandas dataframe
        or an HDF5 file

    label_data: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    label_type: string, mandatory
        Name of the label to extract from the label data to test the
        estimator on.

    alpha: float, default 0.95
        Significance of confidence intervals.

    ensemble: None, False, integer or 'Caruana', default None
        Determine whether an ensemble will be created. If None, the setting
        from the configuration stored in the prediction object is used. If an
        integer is provided, that many of the top performing workflows are
        combined; the string 'Caruana' uses smart ensembling based on
        Caruana et al. 2004.

    verbose: boolean, default True
        Print intermediate messages.

    ensemble_scoring: string, default None
        Metric to be used for evaluating the ensemble. If None,
        the option set in the prediction object will be used.

    output: string, default None
        Determine which results are put out. If 'stats', the statistics of the
        estimator will be returned. If 'scores' or 'decision', the scores and
        predictions will be returned.

    thresholds: list of threshold value(s), default None
        If None, use the default sklearn threshold (0.5) on the posteriors to
        convert them to a binary prediction. If one value is provided, it is
        used directly as the decision threshold. If two values are provided,
        only patients with a posterior below the first or above the second
        (fitted) threshold are kept.

    Returns
    ----------
    Depending on the output parameters, the following outputs are returned:

    If output == 'stats':
    stats: dictionary
        Contains the confidence intervals of the performance metrics
        and the number of times each patient was classified correctly
        or incorrectly.

    If output == 'scores':
    y_truths: list
        Contains the true label for each object.

    y_scores: list
        Contains the score (e.g. posterior) for each object.

    y_predictions: list
        Contains the predicted label for each object.

    pids: list
        Contains the patient ID/name for each object.

    """
    # Load the prediction object if it's a hdf5 file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(
                ('{} is not an existing file!').format(str(prediction)))

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    if label_type is None:
        label_type = keys[0]

    # Load the label data
    if type(label_data) is not dict:
        if os.path.isfile(label_data):
            if type(label_type) is not list:
                # Singlelabel: convert to list
                label_type = [[label_type]]
            label_data = lp.load_labels(label_data, label_type)
        else:
            raise ae.WORCValueError(
                f"Label data {label_data} incorrect: not a dictionary, or file does not exist."
            )

    n_labels = len(label_type)
    patient_IDs = label_data['patient_IDs']
    labels = label_data['label']

    if type(label_type) is list:
        # FIXME: multiple label types are not supported yet.
        print(
            '[WORC Warning] Multiple label types are not yet supported. Taking the first label for plot_estimator_performance.'
        )
        label_type = keys[0]

    # Extract the estimators, features and labels
    regression = is_regressor(
        prediction[label_type]['classifiers'][0].best_estimator_)
    feature_labels = prediction[label_type]['feature_labels']

    # Get some configuration variables if present in the prediction
    config = prediction[label_type].config
    if ensemble is None:
        ensemble = int(config['Ensemble']['Use'])

    if modus is None:
        modus = config['Labels']['modus']

    if crossval_type is None:
        crossval_type = config['CrossValidation']['Type']

    if bootstrap is None:
        bootstrap = config['Bootstrap']['Use']

    if bootstrap_N is None:
        bootstrap_N = int(config['Bootstrap']['N_iterations'])

    if overfit_scaler is None:
        overfit_scaler = config['Evaluation']['OverfitScaler']

    ensemble_metric = config['Ensemble']['Metric']

    # Create lists for performance measures
    if not regression:
        sensitivity = list()
        specificity = list()
        precision = list()
        npv = list()
        accuracy = list()
        bca = list()
        auc = list()
        f1_score_list = list()

        if modus == 'multilabel':
            acc_av = list()

            # Also add scoring measures for all single label scores
            sensitivity_single = [list() for j in range(n_labels)]
            specificity_single = [list() for j in range(n_labels)]
            precision_single = [list() for j in range(n_labels)]
            npv_single = [list() for j in range(n_labels)]
            accuracy_single = [list() for j in range(n_labels)]
            bca_single = [list() for j in range(n_labels)]
            auc_single = [list() for j in range(n_labels)]
            f1_score_list_single = [list() for j in range(n_labels)]

    else:
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

    patient_classification_list = dict()
    percentages_selected = list()

    if output in ['scores', 'decision'] or crossval_type == 'LOO':
        # Keep track of all ground truths and scores
        y_truths = list()
        y_scores = list()
        y_predictions = list()
        pids = list()

    # Extract sample size
    N_1 = float(len(prediction[label_type]['patient_ID_train'][0]))
    N_2 = float(len(prediction[label_type]['patient_ID_test'][0]))

    # Convert tuples to lists if required
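    # (Lists are needed so that individual folds can be set to None further
    # down to free memory during the loop.)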
    if type(prediction[label_type]['X_test']) is tuple:
        prediction[label_type]['X_test'] = list(
            prediction[label_type]['X_test'])
        prediction[label_type]['X_train'] = list(
            prediction[label_type]['X_train'])
        prediction[label_type]['Y_train'] = list(
            prediction[label_type]['Y_train'])
        prediction[label_type]['Y_test'] = list(
            prediction[label_type]['Y_test'])
        prediction[label_type]['patient_ID_test'] = list(
            prediction[label_type]['patient_ID_test'])
        prediction[label_type]['patient_ID_train'] = list(
            prediction[label_type]['patient_ID_train'])
        prediction[label_type]['classifiers'] = list(
            prediction[label_type]['classifiers'])

    # Loop over the test sets, which correspond to cross-validation
    # or bootstrapping iterations
    n_iter = len(prediction[label_type]['Y_test'])
    if bootstrap:
        iterobject = range(0, bootstrap_N)
    else:
        iterobject = range(0, n_iter)
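    # When bootstrapping, the single train/test split from the prediction is
    # reused every iteration and only its test-set predictions are resampled.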

    for i in iterobject:
        print("\n")
        if bootstrap:
            print(f"Bootstrap {i + 1} / {bootstrap_N}.")
        else:
            print(f"Cross-validation {i + 1} / {n_iter}.")

        test_indices = list()

        # When bootstrapping, there is only a single train/test set.
        if bootstrap:
            if i == 0:
                X_test_temp_or = prediction[label_type]['X_test'][0]
                X_train_temp = prediction[label_type]['X_train'][0]
                Y_train_temp = prediction[label_type]['Y_train'][0]
                Y_test_temp_or = prediction[label_type]['Y_test'][0]
                test_patient_IDs_or = prediction[label_type][
                    'patient_ID_test'][0]
                train_patient_IDs = prediction[label_type]['patient_ID_train'][
                    0]
                fitted_model = prediction[label_type]['classifiers'][0]

                # Objects required for first iteration
                test_patient_IDs = test_patient_IDs_or[:]
                X_test_temp = X_test_temp_or[:]
                Y_test_temp = Y_test_temp_or[:]
        else:
            X_test_temp = prediction[label_type]['X_test'][i]
            X_train_temp = prediction[label_type]['X_train'][i]
            Y_train_temp = prediction[label_type]['Y_train'][i]
            Y_test_temp = prediction[label_type]['Y_test'][i]
            test_patient_IDs = prediction[label_type]['patient_ID_test'][i]
            train_patient_IDs = prediction[label_type]['patient_ID_train'][i]
            fitted_model = prediction[label_type]['classifiers'][i]

        # Check which patients are in the test set.
        if output == 'stats' and crossval_type != 'LOO':
            for i_ID in test_patient_IDs:
                # Initiate counting how many times a patient is classified correctly
                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

                # Check whether this patient ID occurs in the label file
                if i_ID not in patient_IDs:
                    print(
                        f'[WORC WARNING] Patient {i_ID} is not found in the patient labels, stripping the underscore suffix.'
                    )
                    i_ID = i_ID.split("_")[0]
                    if i_ID not in patient_IDs:
                        print(
                            f'[WORC WARNING] Did not help, excluding patient {i_ID}.'
                        )
                        continue

                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

        # Extract ground truth
        y_truth = Y_test_temp

        # If required, shuffle estimators for "Random" ensembling
        if shuffle_estimators:
            # Randomly shuffle the estimators
            print('Shuffling estimators for random ensembling.')
            shuffle(fitted_model.cv_results_['params'])
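            # With the ranking shuffled, the top-N ensemble created below is
            # effectively a random selection of workflows.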

        # If requested, first let the SearchCV object create an ensemble
        if bootstrap and i > 0:
            # For bootstrapping, only do this at the first iteration
            pass
        elif not fitted_model.ensemble:
            # If required, rank according to generalization score instead of mean_validation_score
            if ensemble_metric == 'generalization':
                print('Using generalization score for estimator ranking.')
                indices = fitted_model.cv_results_['rank_generalization_score']
                fitted_model.cv_results_['params'] = [
                    fitted_model.cv_results_['params'][i]
                    for i in indices[::-1]
                ]
            elif ensemble_metric != 'Default':
                raise ae.WORCKeyError(
                    f'Metric {ensemble_metric} is not known: use Default or generalization.'
                )

            # NOTE: Added for backwards compatibility
            if not hasattr(fitted_model, 'cv_iter'):
                cv_iter = list(
                    fitted_model.cv.split(X_train_temp, Y_train_temp))
                fitted_model.cv_iter = cv_iter

            # Create the ensemble
            X_train_temp = [(x, feature_labels) for x in X_train_temp]
            fitted_model.create_ensemble(X_train_temp,
                                         Y_train_temp,
                                         method=ensemble,
                                         verbose=verbose,
                                         scoring=ensemble_scoring,
                                         overfit_scaler=overfit_scaler)

        # If bootstrap, generate a bootstrapped sample
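        # resample (presumably sklearn.utils.resample) draws a sample with
        # replacement, keeping truths, predictions, scores and IDs aligned.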
        if bootstrap and i > 0:
            y_truth, y_prediction, y_score, test_patient_IDs =\
                resample(y_truth_all, y_prediction_all,
                         y_score_all, test_patient_IDs_or)
        else:
            # Create prediction
            y_prediction = fitted_model.predict(X_test_temp)

            if regression:
                y_score = y_prediction
            elif modus == 'multilabel':
                y_score = fitted_model.predict_proba(X_test_temp)
            else:
                y_score = fitted_model.predict_proba(X_test_temp)[:, 1]

            # Create a new binary score based on the thresholds if given
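            # Illustrative values (assumptions, not defaults): thresholds=[0.35]
            # binarizes via y_score >= 0.35; thresholds=[0.35, 0.65] keeps only
            # patients whose posterior lies outside the fitted interval.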
            if thresholds is not None:
                if len(thresholds) == 1:
                    y_prediction = y_score >= thresholds[0]
                elif len(thresholds) == 2:
                    # X_train_temp = [x[0] for x in X_train_temp]

                    y_score_temp = list()
                    y_prediction_temp = list()
                    y_truth_temp = list()
                    test_patient_IDs_temp = list()

                    thresholds_val = fit_thresholds(thresholds, fitted_model,
                                                    X_train_temp, Y_train_temp,
                                                    ensemble, ensemble_scoring)
                    for pnum in range(len(y_score)):
                        if y_score[pnum] <= thresholds_val[0] or y_score[
                                pnum] > thresholds_val[1]:
                            y_score_temp.append(y_score[pnum])
                            y_prediction_temp.append(y_prediction[pnum])
                            y_truth_temp.append(y_truth[pnum])
                            test_patient_IDs_temp.append(
                                test_patient_IDs[pnum])

                    perc = float(len(y_prediction_temp)) / float(
                        len(y_prediction))
                    percentages_selected.append(perc)
                    print(
                        f"Selected {len(y_prediction_temp)} from {len(y_prediction)} ({100 * perc:.1f}%) patients using two thresholds."
                    )
                    y_score = y_score_temp
                    y_prediction = y_prediction_temp
                    y_truth = y_truth_temp
                    test_patient_IDs = test_patient_IDs_temp
                else:
                    raise ae.WORCValueError(
                        f"Need None, one or two thresholds on the posterior; got {len(thresholds)}."
                    )

            # If all scores are NaN, the classifier cannot produce
            # probabilities, so fall back to the hard predictions
            if np.sum(np.isnan(y_score)) == len(y_prediction):
                print(
                    '[WORC Warning] All scores NaN, replacing with prediction.'
                )
                y_score = y_prediction

        if bootstrap and i == 0:
            # Save objects for re-use
            y_truth_all = y_truth[:]
            y_prediction_all = y_prediction[:]
            y_score_all = y_score[:]

        print("Truth: " + str(y_truth))
        print("Prediction: " + str(y_prediction))
        print("Score: " + str(y_score))

        if output == 'stats' and crossval_type != 'LOO':
            # Record whether each patient was classified correctly
            for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction,
                                                     test_patient_IDs):
                if modus == 'multilabel':
                    success = (i_truth == i_predict).all()
                else:
                    success = i_truth == i_predict

                if success:
                    patient_classification_list[i_test_ID]['N_correct'] += 1
                else:
                    patient_classification_list[i_test_ID]['N_wrong'] += 1

        if output in ['decision', 'scores'] or crossval_type == 'LOO':
            # Output the posteriors
            y_scores.append(y_score)
            y_truths.append(y_truth)
            y_predictions.append(y_prediction)
            pids.append(test_patient_IDs)

        elif output == 'stats':
            # Compute statistics
            print('Computing performance statistics.')
            # Compute confusion matrix and use for sensitivity/specificity
            performances = compute_statistics(y_truth, y_score, y_prediction,
                                              modus, regression)

            # Print AUC to keep you up to date
            if not regression:
                if modus == 'singlelabel':
                    accuracy_temp, bca_temp, sensitivity_temp,\
                        specificity_temp, precision_temp, npv_temp,\
                        f1_score_temp, auc_temp = performances
                else:
                    accuracy_temp, sensitivity_temp,\
                        specificity_temp, precision_temp, npv_temp,\
                        f1_score_temp, auc_temp, acc_av_temp,\
                        accuracy_temp_single,\
                        bca_temp_single, sensitivity_temp_single,\
                        specificity_temp_single, precision_temp_single,\
                        npv_temp_single, f1_score_temp_single,\
                        auc_temp_single = performances

                print('AUC: ' + str(auc_temp))

                # Append performance to lists for all cross validations
                accuracy.append(accuracy_temp)
                bca.append(bca_temp)
                sensitivity.append(sensitivity_temp)
                specificity.append(specificity_temp)
                auc.append(auc_temp)
                f1_score_list.append(f1_score_temp)
                precision.append(precision_temp)
                npv.append(npv_temp)

                if modus == 'multilabel':
                    acc_av.append(acc_av_temp)
                    for j in range(n_labels):
                        accuracy_single[j].append(accuracy_temp_single[j])
                        bca_single[j].append(bca_temp_single[j])
                        sensitivity_single[j].append(
                            sensitivity_temp_single[j])
                        specificity_single[j].append(
                            specificity_temp_single[j])
                        auc_single[j].append(auc_temp_single[j])
                        f1_score_list_single[j].append(f1_score_temp_single[j])
                        precision_single[j].append(precision_temp_single[j])
                        npv_single[j].append(npv_temp_single[j])

            else:
                r2score_temp, MSE_temp, coefICC_temp, PearsonC_temp,\
                    PearsonP_temp, SpearmanC_temp,\
                    SpearmanP_temp = performances

                print('R2 Score: ' + str(r2score_temp))
                r2score.append(r2score_temp)
                MSE.append(MSE_temp)
                coefICC.append(coefICC_temp)
                PearsonC.append(PearsonC_temp)
                PearsonP.append(PearsonP_temp)
                SpearmanC.append(SpearmanC_temp)
                SpearmanP.append(SpearmanP_temp)

        # Delete some objects to save memory in cross-validation
        if not bootstrap:
            del fitted_model, X_test_temp, X_train_temp, Y_train_temp
            del Y_test_temp, test_patient_IDs, train_patient_IDs
            prediction[label_type]['X_test'][i] = None
            prediction[label_type]['X_train'][i] = None
            prediction[label_type]['Y_train'][i] = None
            prediction[label_type]['Y_test'][i] = None
            prediction[label_type]['patient_ID_test'][i] = None
            prediction[label_type]['patient_ID_train'][i] = None
            prediction[label_type]['classifiers'][i] = None

    if output in ['scores', 'decision']:
        # Return the scores and true values of all patients
        return y_truths, y_scores, y_predictions, pids

    elif output == 'stats':
        # Compute statistics
        stats = dict()
        output = dict()
        if crossval_type == 'LOO':
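            # In leave-one-out cross-validation each test set holds a single
            # patient, so performance is computed once over all pooled
            # predictions instead of per iteration.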
            performances = compute_statistics(y_truths, y_scores,
                                              y_predictions, modus, regression)

            if not regression:
                metric_names_single = [
                    'Accuracy', 'BCA', 'Sensitivity', 'Specificity',
                    'Precision', 'NPV', 'F1-score', 'AUC'
                ]
                if modus == 'singlelabel':
                    metric_names = metric_names_single
                elif modus == 'multilabel':
                    metric_names_multi = [
                        'Accuracy', 'Sensitivity', 'Specificity', 'Precision',
                        'NPV', 'F1-score', 'AUC', 'Average Accuracy'
                    ]
                    metric_names = metric_names_multi + metric_names_single

            else:
                # Regression
                metric_names = [
                    'R2-score', 'MSE', 'ICC', 'PearsonC', 'PearsonP',
                    'SpearmanC', 'SpearmanP'
                ]

            # Put all metrics with their names in the statistics dict
            for k, v in zip(metric_names, performances):
                stats[k] = str(v)

            if thresholds is not None:
                if len(thresholds) == 2:
                    # Compute percentage of patients that was selected
                    stats["Percentage Selected"] = str(percentages_selected[0])

            output['Statistics'] = stats

        else:
            # Compute alpha confidence intervals (CIs)
            # FIXME: multilabel performance per single label not included
            # FIXME: multilabel not working in bootstrap
            # FIXME: bootstrap not done in regression
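            # Note: compute_confidence is assumed here to implement the
            # corrected resampled t-test interval (Nadeau & Bengio), which is
            # why the train and test sample sizes N_1 and N_2 are passed in.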
            if not regression:
                metric_names_single = [
                    'Accuracy', 'BCA', 'Sensitivity', 'Specificity',
                    'Precision', 'NPV', 'F1-score', 'AUC'
                ]

                if bootstrap:
                    # Compute the performance on the original (non-bootstrapped) test set once
                    X_test_temp = prediction[label_type]['X_test'][0]
                    y_truth = prediction[label_type]['Y_test'][0]
                    y_prediction = fitted_model.predict(X_test_temp)
                    y_score = fitted_model.predict_proba(X_test_temp)[:, 1]

                    performances_test =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)
                    # Aggregate bootstrapped performances
                    performances_bootstrapped =\
                        [accuracy, bca, sensitivity, specificity, precision,
                         npv, f1_score_list, auc]

                    # Compute confidence intervals for all metrics
                    for p in range(len(metric_names_single)):
                        k = metric_names_single[p] + ' 95%'
                        perf = performances_bootstrapped[p]
                        perf_test = performances_test[p]
                        stats[
                            k] = f"{perf_test} {str(compute_confidence_bootstrap(perf, perf_test, N_1, alpha))}"

                else:
                    stats[
                        "Accuracy 95%:"] = f"{np.nanmean(accuracy)} {str(compute_confidence(accuracy, N_1, N_2, alpha))}"
                    stats[
                        "BCA 95%:"] = f"{np.nanmean(bca)} {str(compute_confidence(bca, N_1, N_2, alpha))}"
                    stats[
                        "AUC 95%:"] = f"{np.nanmean(auc)} {str(compute_confidence(auc, N_1, N_2, alpha))}"
                    stats[
                        "F1-score 95%:"] = f"{np.nanmean(f1_score_list)} {str(compute_confidence(f1_score_list, N_1, N_2, alpha))}"
                    stats[
                        "Precision 95%:"] = f"{np.nanmean(precision)} {str(compute_confidence(precision, N_1, N_2, alpha))}"
                    stats[
                        "NPV 95%:"] = f"{np.nanmean(npv)} {str(compute_confidence(npv, N_1, N_2, alpha))}"
                    stats[
                        "Sensitivity 95%:"] = f"{np.nanmean(sensitivity)} {str(compute_confidence(sensitivity, N_1, N_2, alpha))}"
                    stats[
                        "Specificity 95%:"] = f"{np.nanmean(specificity)} {str(compute_confidence(specificity, N_1, N_2, alpha))}"

                    if modus == 'multilabel':
                        stats[
                            "Average Accuracy 95%:"] = f"{np.nanmean(acc_av)} {str(compute_confidence(acc_av, N_1, N_2, alpha))}"

                if thresholds is not None:
                    if len(thresholds) == 2:
                        # Compute percentage of patients that was selected
                        stats[
                            "Percentage Selected 95%:"] = f"{np.nanmean(percentages_selected)} {str(compute_confidence(percentages_selected, N_1, N_2, alpha))}"

                # Extract statistics on how often patients got classified correctly
                rankings = dict()
                alwaysright = dict()
                alwayswrong = dict()
                percentages = dict()
                timesintestset = dict()
                for i_ID in patient_classification_list:
                    percentage_right = patient_classification_list[i_ID][
                        'N_correct'] / float(
                            patient_classification_list[i_ID]['N_test'])

                    if i_ID in patient_IDs:
                        label = labels[0][np.where(i_ID == patient_IDs)]
                    else:
                        # Multiple instances of one patient
                        label = labels[0][np.where(
                            i_ID.split('_')[0] == patient_IDs)]

                    label = label[0][0]
                    percentages[i_ID] = str(label) + ': ' + str(
                        round(percentage_right * 100, 2)) + '%'
                    if percentage_right == 1.0:
                        alwaysright[i_ID] = label
                        print(f"Always Right: {i_ID}, label {label}.")

                    elif percentage_right == 0:
                        alwayswrong[i_ID] = label
                        print(f"Always Wrong: {i_ID}, label {label}.")

                    timesintestset[i_ID] = patient_classification_list[i_ID][
                        'N_test']

                rankings["Always right"] = alwaysright
                rankings["Always wrong"] = alwayswrong
                rankings['Percentages'] = percentages
                rankings['timesintestset'] = timesintestset

                output['Rankings'] = rankings

            else:
                # Regression
                stats[
                    'R2-score 95%: '] = f"{np.nanmean(r2score)} {str(compute_confidence(r2score, N_1, N_2, alpha))}"
                stats[
                    'MSE 95%: '] = f"{np.nanmean(MSE)} {str(compute_confidence(MSE, N_1, N_2, alpha))}"
                stats[
                    'ICC 95%: '] = f"{np.nanmean(coefICC)} {str(compute_confidence(coefICC, N_1, N_2, alpha))}"
                stats[
                    'PearsonC 95%: '] = f"{np.nanmean(PearsonC)} {str(compute_confidence(PearsonC, N_1, N_2, alpha))}"
                stats[
                    'PearsonP 95%: '] = f"{np.nanmean(PearsonP)} {str(compute_confidence(PearsonP, N_1, N_2, alpha))}"
                stats[
                    'SpearmanC 95%: '] = f"{np.nanmean(SpearmanC)} {str(compute_confidence(SpearmanC, N_1, N_2, alpha))}"
                stats[
                    'SpearmanP 95%: '] = f"{np.nanmean(SpearmanP)} {str(compute_confidence(SpearmanP, N_1, N_2, alpha))}"

        # Print all CIs and add them to the output
        stats = OrderedDict(sorted(stats.items()))
        for k, v in stats.items():
            print(f"{k} : {v}.")

        output['Statistics'] = stats
        return output