def custom_config_overrides(self, config):
        """Validate user overrides against the default WORC config and store them.

        Every top-level key of ``config`` must exist in the default WORC
        config, and every key of each nested mapping must exist in the
        corresponding part of the default config. Only when all keys are
        known are the overrides merged into ``self._custom_overrides``.

        Raises a WORCKeyError for the first unknown key encountered.
        """
        reference = WORC().defaultconfig()
        known_parts = tuple(reference.keys())

        for part, fields in config.items():
            # The part itself has to be known before its fields are checked
            if part not in known_parts:
                raise WORCKeyError(f'Key "{part}" is not in the WORC config!.')

            known_fields = tuple(reference[part].keys())
            for field in fields.keys():
                if field not in known_fields:
                    raise WORCKeyError(f'Key "{field}" is not in part "{part}" of the WORC config!.')

        # All keys are valid: merge the overrides
        _deep_update(self._custom_overrides, config)
Esempio n. 2
0
def ComBat(features_train_in,
           labels_train,
           config,
           features_train_out,
           features_test_in=None,
           labels_test=None,
           features_test_out=None,
           VarianceThreshold=True,
           scaler=False,
           logarithmic=False):
    """
    Apply ComBat feature harmonization.

    Based on: https://github.com/Jfortin1/ComBatHarmonization

    The training (and optionally test) features are loaded, optionally
    imputed, scaled and variance-thresholded, harmonized in a single ComBat
    call, and written per patient to the given output files. Patients whose
    batch value equals -1.0 are not harmonized; their features are written
    unharmonized under the name 'Image features: ComBat skipped'.

    Parameters
    ----------
    features_train_in:
        Training feature input, passed on to wio.load_features.

    labels_train:
        Passed to wio.load_features as ``patientinfo`` for the training set.

    config:
        Passed to cio.load_config. From the loaded result, the 'ComBat' part
        is read: 'excluded_features', 'batch', 'mod', 'par', 'language',
        'per_feature', and depending on the language 'matlab' or 'eb'.

    features_train_out:
        Indexable collection of output locations, one per training patient.
        NOTE: entries of skipped patients are deleted from this object
        in place.

    features_test_in, labels_test, features_test_out: optional
        Test set counterparts of the training arguments. The test set is
        only processed when features_test_in is truthy.

    VarianceThreshold: boolean, default True
        If truthy, selfeat_variance is applied to the training features and
        the resulting selector is applied to the test features.

    scaler: boolean, default False
        If truthy, a StandardScaler is fitted on the training features and
        applied to both training and test features.

    logarithmic: boolean, default False
        If truthy, log10 is taken of the features before ComBat, and the
        harmonized values are converted back with 10**x afterwards.

    Raises
    ------
    WORCKeyError
        If config['ComBat']['language'] is neither 'matlab' nor 'python'.

    Returns nothing: the results are written with pandas ``to_hdf`` under the
    key 'image_features'.
    """
    # Load the config
    print('############################################################')
    print('#                    Initializing ComBat.                  #')
    print('############################################################\n')
    config = cio.load_config(config)
    excluded_features = config['ComBat']['excluded_features']

    # If mod is given, then also load the moderating labels.
    # A first entry equal to the string '[]' is treated as "no mod variables".
    if config['ComBat']['mod'][0] == '[]':
        label_names = config['ComBat']['batch']
    else:
        label_names = config['ComBat']['batch'] + config['ComBat']['mod']

    # Load the features for both training and testing, match with batch and mod parameters
    label_data_train, image_features_train =\
        wio.load_features(features_train_in, patientinfo=labels_train,
                          label_type=label_names)

    # Each loaded item is indexed as (values, labels); the labels of the
    # first patient are used for all patients.
    feature_labels = image_features_train[0][1]
    image_features_train = [i[0] for i in image_features_train]
    # Converted to a list as entries are deleted per index further below
    label_data_train['patient_IDs'] = list(label_data_train['patient_IDs'])

    # Exclude features
    if excluded_features:
        print(f'\t Excluding features containing: {excluded_features}')
        # Determine indices of excluded features: a feature is excluded when
        # any entry of excluded_features is contained in its label
        included_feature_indices = []
        excluded_feature_indices = []
        for fnum, i in enumerate(feature_labels):
            if not any(e in i for e in excluded_features):
                included_feature_indices.append(fnum)
            else:
                excluded_feature_indices.append(fnum)

        # Actually exclude the features
        image_features_train_combat = [
            np.asarray(i)[included_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_combat = np.asarray(
            feature_labels)[included_feature_indices].tolist()

        # Keep the excluded features apart: they are appended unharmonized
        # to the output
        image_features_train_noncombat = [
            np.asarray(i)[excluded_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_noncombat = np.asarray(
            feature_labels)[excluded_feature_indices].tolist()

    else:
        image_features_train_combat = image_features_train
        # assumes feature_labels provides .tolist() (e.g. a numpy array) - TODO confirm
        feature_labels_combat = feature_labels.tolist()

        image_features_train_noncombat = []
        feature_labels_noncombat = []

    # Detect NaNs, otherwise first feature imputation is required
    if any(
            np.isnan(a)
            for a in np.asarray(image_features_train_combat).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(image_features_train_combat)
        image_features_train_combat = imputer.transform(
            image_features_train_combat)
    else:
        imputer = None

    # Apply a scaler to the features.
    # Note that the boolean argument ``scaler`` is rebound to the fitted
    # scaler object, which is reused (and tested for truthiness) for the
    # test set below.
    if scaler:
        print('\t Fitting scaler on dataset.')
        scaler = StandardScaler().fit(image_features_train_combat)
        image_features_train_combat = scaler.transform(
            image_features_train_combat)

    # Remove features with a constant value
    if VarianceThreshold:
        print(f'\t Applying variance threshold on dataset.')
        image_features_train_combat, feature_labels_combat, VarSel =\
            selfeat_variance(image_features_train_combat, np.asarray([feature_labels_combat]))
        feature_labels_combat = feature_labels_combat[0].tolist()

    if features_test_in:
        label_data_test, image_features_test =\
            wio.load_features(features_test_in, patientinfo=labels_test,
                              label_type=label_names)

        image_features_test = [i[0] for i in image_features_test]
        label_data_test['patient_IDs'] = list(label_data_test['patient_IDs'])

        # Split the test features with the indices determined on the
        # training set
        if excluded_features:
            image_features_test_combat = [
                np.asarray(i)[included_feature_indices].tolist()
                for i in image_features_test
            ]
            image_features_test_noncombat = [
                np.asarray(i)[excluded_feature_indices].tolist()
                for i in image_features_test
            ]
        else:
            image_features_test_combat = image_features_test
            image_features_test_noncombat = []

        # Apply imputation if required
        if imputer is not None:
            image_features_test_combat = imputer.transform(
                image_features_test_combat)

        # Apply a scaler to the features
        if scaler:
            image_features_test_combat = scaler.transform(
                image_features_test_combat)

        # Remove features with a constant value
        if VarianceThreshold:
            image_features_test_combat = VarSel.transform(
                image_features_test_combat)

        # Stack training and test patients: training rows come first.
        # NOTE(review): .tolist() requires arrays here; when no imputation,
        # scaling or variance threshold was applied these may still be
        # plain lists - confirm.
        all_features = image_features_train_combat.tolist(
        ) + image_features_test_combat.tolist()
        all_labels = list()
        for i in range(label_data_train['label'].shape[0]):
            all_labels.append(label_data_train['label'][i, :, 0].tolist() +
                              label_data_test['label'][i, :, 0].tolist())
        all_labels = np.asarray(all_labels)
    else:
        # NOTE(review): same .tolist() assumption as in the branch above
        all_features = image_features_train_combat.tolist()
        all_labels = label_data_train['label']

    # Convert data to a single array
    all_features_matrix = np.asarray(all_features)
    all_labels = np.squeeze(all_labels)

    # Apply logarithm if required
    if logarithmic:
        print('\t Taking log10 of features before applying ComBat.')
        all_features_matrix = np.log10(all_features_matrix)

    # Convert all_labels to dictionary, keyed by label name
    if len(all_labels.shape) == 1:
        # No mod variables
        all_labels = {label_data_train['label_name'][0]: all_labels}
    else:
        all_labels = {
            k: v
            for k, v in zip(label_data_train['label_name'], all_labels)
        }

    # Split labels in batch and moderation labels
    bat = config['ComBat']['batch']
    mod = config['ComBat']['mod']
    print(f'\t Using batch variable {bat}, mod variables {mod}.')
    batch = [
        all_labels[l] for l in all_labels.keys()
        if l in config['ComBat']['batch']
    ]
    # Only the first matching batch label is used
    batch = batch[0]
    if config['ComBat']['mod'][0] == '[]':
        mod = None
    else:
        mod = [
            all_labels[l] for l in all_labels.keys()
            if l in config['ComBat']['mod']
        ]

    # Set parameters for output files
    parameters = {
        'batch': config['ComBat']['batch'],
        'mod': config['ComBat']['mod'],
        'par': config['ComBat']['par']
    }
    name = 'Image features: ComBat corrected'
    panda_labels = [
        'parameters', 'patient', 'feature_values', 'feature_labels'
    ]
    # Same order as in which the values are concatenated below: first the
    # ComBat features, then the excluded (non-ComBat) features
    feature_labels = feature_labels_combat + feature_labels_noncombat

    # Convert all inputs to arrays with right shape:
    # after transposing, the features matrix has one column per patient
    all_features_matrix = np.transpose(all_features_matrix)
    if mod is not None:
        mod = np.transpose(np.asarray(mod))

    # Patients identified with batch -1.0 should be skipped
    skipname = 'Image features: ComBat skipped'
    ntrain = len(image_features_train_combat)
    ndel = 0
    print(features_test_out)
    # The loop iterates over the original batch object; ``batch`` is rebound
    # to a shortened copy inside the loop, hence the index is corrected by
    # the number of patients deleted so far.
    for bnum, b in enumerate(batch):
        bnum -= ndel
        if b == -1.0:
            if bnum < ntrain - ndel:
                # Training patient
                print('train')
                pid = label_data_train['patient_IDs'][bnum]
                out = features_train_out[bnum]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(
                    all_features_matrix[:, bnum]) + list(
                        image_features_train_noncombat[bnum])

                # Delete patient for later processing
                del label_data_train['patient_IDs'][bnum]
                del image_features_train_noncombat[bnum]
                del features_train_out[bnum]
                image_features_train_combat = np.delete(
                    image_features_train_combat, bnum, 0)

            else:
                # Test patient
                # NOTE(review): bnum has already been reduced by all earlier
                # deletions (training and test), while ntrain is the
                # training count from before any deletion, so bnum - ntrain
                # looks shifted when training patients were skipped
                # earlier - confirm.
                print('test')
                pid = label_data_test['patient_IDs'][bnum - ntrain]
                out = features_test_out[bnum - ntrain]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(
                    all_features_matrix[:, bnum]) + list(
                        image_features_test_noncombat[bnum - ntrain])

                # Delete patient for later processing
                del label_data_test['patient_IDs'][bnum - ntrain]
                del image_features_test_noncombat[bnum - ntrain]
                del features_test_out[bnum - ntrain]
                image_features_test_combat = np.delete(
                    image_features_test_combat, bnum - ntrain, 0)

            # Delete some other variables for later processing
            all_features_matrix = np.delete(all_features_matrix, bnum, 1)
            if mod is not None:
                mod = np.delete(mod, bnum, 0)
            batch = np.delete(batch, bnum, 0)

            # Notify user
            print(
                f'[WARNING] Skipping patient {pid} as batch variable is -1.0.')

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=skipname)

            print(f'\t Saving image features to: {out}.')
            panda_data.to_hdf(out, 'image_features')

            ndel += 1

    print(features_test_out)
    # Run ComBat in Matlab
    if config['ComBat']['language'] == 'matlab':
        print('\t Executing ComBat through Matlab')
        data_harmonized = ComBatMatlab(
            dat=all_features_matrix,
            batch=batch,
            command=config['ComBat']['matlab'],
            mod=mod,
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])

    # Or run ComBat in Python
    elif config['ComBat']['language'] == 'python':
        print('\t Executing ComBat through neuroComBat in Python')
        data_harmonized = ComBatPython(
            dat=all_features_matrix,
            batch=batch,
            mod=mod,
            eb=config['ComBat']['eb'],
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])
    else:
        raise WORCKeyError(f"Language {config['ComBat']['language']} unknown.")

    # Convert values back if logarithm was used
    if logarithmic:
        data_harmonized = 10**data_harmonized

    # Convert again to train hdf5 files: the first columns of the harmonized
    # matrix are taken as the remaining training patients
    feature_values_train_combat = [
        data_harmonized[:, i] for i in range(len(image_features_train_combat))
    ]
    for fnum, i_feat in enumerate(feature_values_train_combat):
        # Combine ComBat and non-ComBat features
        feature_values_temp = i_feat.tolist(
        ) + image_features_train_noncombat[fnum]

        # Sort based on feature label
        feature_labels_temp, feature_values_temp =\
            zip(*sorted(zip(feature_labels, feature_values_temp)))

        # Convert to pandas Series and save as hdf5
        pid = label_data_train['patient_IDs'][fnum]
        panda_data = pd.Series(
            [parameters, pid, feature_values_temp, feature_labels_temp],
            index=panda_labels,
            name=name)

        print(f'Saving image features to: {features_train_out[fnum]}.')
        panda_data.to_hdf(features_train_out[fnum], 'image_features')

    # Repeat for testing if required
    if features_test_in:
        print(len(image_features_test_combat))
        print(data_harmonized.shape[1])
        # The last columns of the harmonized matrix are taken as the
        # remaining test patients
        feature_values_test_combat = [
            data_harmonized[:, i] for i in range(
                data_harmonized.shape[1] -
                len(image_features_test_combat), data_harmonized.shape[1])
        ]
        for fnum, i_feat in enumerate(feature_values_test_combat):
            print(fnum)
            # Combine ComBat and non-ComBat features
            feature_values_temp = i_feat.tolist(
            ) + image_features_test_noncombat[fnum]

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            pid = label_data_test['patient_IDs'][fnum]
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=name)

            print(f'Saving image features to: {features_test_out[fnum]}.')
            panda_data.to_hdf(features_test_out[fnum], 'image_features')
Esempio n. 3
0
def plot_ranked_scores(estimator,
                       pinfo,
                       label_type,
                       scores='percentages',
                       images=None,
                       segmentations=None,
                       ensemble=50,
                       output_csv=None,
                       output_zip=None,
                       output_itk=None):
    '''
    Rank the patients according to their average score. The score can either
    be the average posterior or the percentage of times the patient was
    classified correctly in the cross validations. Additionally,
    the middle slice of each patient is plotted and saved according to the
    ranking.

    Parameters
    ----------
    estimator: filepath, mandatory
        Path pointing to the .hdf5 file which is the output of the
        trainclassifier function.

    pinfo: filepath, mandatory
        Path pointing to the .txt file which contains the patient label
        information.

    label_type: string or None, mandatory
        The name of the label predicted by the estimator. If None,
        the first label from the prediction file will be used.

    scores: string, default percentages
        Type of scoring to be used. Either 'posteriors' or 'percentages'.

    images: list, optional
        List containing the filepaths to the ITKImage image files of the
        patients. Defaults to an empty list.

    segmentations: list, optional
        List containing the filepaths to the ITKImage segmentation files of
        the patients. Defaults to an empty list.

    ensemble: integer or string, optional
        Method to be used for ensembling. Either an integer for a fixed size
        or 'Caruana' for the Caruana method, see the SearchCV function for more
        details.

    output_csv: filepath, optional
        If given, the scores will be written to this csv file.

    output_zip: filepath, optional
        If given, the images will be plotted and the pngs saved to this
        zip file.

    output_itk: filepath, optional
        WIP

    Raises
    ------
    WORCKeyError
        If scores is neither 'posteriors' nor 'percentages'.

    '''
    # Use a fresh list per call instead of a shared mutable default argument
    images = [] if images is None else images
    segmentations = [] if segmentations is None else segmentations

    if label_type is None:
        # Assume we want to have the first key. The prediction file is only
        # read here, as it is not needed when the label type is given.
        label_type = pd.read_hdf(estimator).keys()[0]

    # Select the ranking function belonging to the scoring method
    if scores == 'posteriors':
        rank_function = plot_ranked_posteriors
    elif scores == 'percentages':
        rank_function = plot_ranked_percentages
    else:
        raise WORCKeyError(f'{scores!s} is not a valid scoring method!')

    ranked_scores, ranked_truths, ranked_PIDs =\
        rank_function(estimator=estimator,
                      pinfo=pinfo,
                      label_type=label_type,
                      ensemble=ensemble,
                      output_csv=output_csv)

    if output_zip is not None or output_itk is not None:
        # Rerank the scores split per ground truth class: negative for 0, positive for 1
        signed_scores = [
            -score if truth == 0 else score
            for truth, score in zip(ranked_truths, ranked_scores)
        ]

        ranking = np.argsort(signed_scores)
        ranked_scores = [signed_scores[r] for r in ranking]
        ranked_truths = [ranked_truths[r] for r in ranking]

        # Convert to lower to later on overcome matching errors
        ranked_PIDs = [ranked_PIDs[r].lower() for r in ranking]

        plot_ranked_images(pinfo=pinfo,
                           label_type=label_type,
                           images=images,
                           segmentations=segmentations,
                           ranked_truths=ranked_truths,
                           ranked_scores=ranked_scores,
                           ranked_PIDs=ranked_PIDs,
                           output_zip=output_zip,
                           output_itk=output_itk)
Esempio n. 4
0
        ranked_scores, ranked_truths, ranked_PIDs =\
            plot_ranked_posteriors(estimator=estimator,
                                   pinfo=pinfo,
                                   label_type=label_type,
                                   ensemble=ensemble,
                                   output_csv=output_csv)
    elif scores == 'percentages':
        ranked_scores, ranked_truths, ranked_PIDs =\
            plot_ranked_percentages(estimator=estimator,
                                    pinfo=pinfo,
                                    label_type=label_type,
                                    ensemble=ensemble,
                                    output_csv=output_csv)
    else:
        message = ('{} is not a valid scoring method!').format(str(scores))
        raise WORCKeyError(message)

    if output_zip is not None:
        # Convert to lower to later on overcome matching errors
        ranked_PIDs = [i.lower() for i in ranked_PIDs]

        plot_ranked_images(pinfo=pinfo,
                           label_type=label_type,
                           images=images,
                           segmentations=segmentations,
                           ranked_truths=ranked_truths,
                           ranked_scores=ranked_scores,
                           ranked_PIDs=ranked_PIDs,
                           output_zip=output_zip)

def convert_radiomix_features(input_file, output_folder):
    '''
    Convert .xlsx from RadiomiX to WORC compatible .hdf5 format

    For each row of the input file, a folder is created inside the output
    folder in which a features.hdf5 file is written, containing a pandas
    Series with the feature values and the renamed feature labels.

    Input:
    --------------

    input_file: .xlsx in which the features are stored. Column 4 is read as
        the patient ID, column 5 as the segmentation name, and all columns
        from column 10 onwards as features (zero-based).
    output_folder: folder in which the features are stored. Created,
        including missing parent folders, if it does not exist.

    Raises a WORCKeyError if a feature label does not start with one of the
    known feature group prefixes.
    '''

    print('Converting .xlsx from RadiomiX to WORC compatible .hdf5 format...')
    # Create the output folder if needed: makedirs with exist_ok avoids the
    # check-then-create race and also creates missing parent folders
    os.makedirs(output_folder, exist_ok=True)

    # Read the input file and extract relevant fields
    f = pd.read_excel(input_file)
    pids = f.values[:, 4]
    segs = f.values[:, 5]
    features = f.values[:, 10:]

    # RadiomiX label prefixes per feature group, with the WORC group prefix.
    # The order equals the order in which the groups are checked.
    prefix_groups = (
        (tuple(texture_features), 'tf_'),  # Texture features
        (('IH_', 'Stats_'), 'hf_'),        # Histogram features
        (('Shape_',), 'sf_'),              # Shape features
        (('LoG_',), 'logf_'),              # LoG features
        (('Fractal_',), 'fracf_'),         # Fractal features
        (('LocInt_',), 'locf_'),           # Location features
        (('RGRD_',), 'rgrdf_'),            # RGRD features
        (('Wavelet_',), 'waveletf_'),      # Wavelet features
    )

    # Read the feature labels, and rename them according to the group they belong to
    feature_labels = list(f.keys()[10:])
    for label_index, label in enumerate(feature_labels):
        for radiomix_prefixes, group_prefix in prefix_groups:
            if label.startswith(radiomix_prefixes):
                feature_labels[label_index] =\
                    group_prefix + 'RadiomiX_' + label
                break
        else:
            # None of the groups matched
            raise WORCKeyError(f'Unknown feature {label}.')

    # Initiate labels for pandas file
    panda_labels = ['feature_values', 'feature_labels']

    # For each patient, convert features
    for pid, seg, patient_features in zip(pids, segs, features):
        feature_values = patient_features.tolist()

        # Make an output folder per patient, remove invalid symbols.
        output = pid + seg
        for invalid_symbol in (' ', '(', ')'):
            output = output.replace(invalid_symbol, '_')
        output = os.path.join(output_folder, output)

        # Create the patient folder if it does not exist yet
        os.makedirs(output, exist_ok=True)

        output = os.path.join(output, 'features.hdf5')

        print(f'\t Writing {output}')

        # Convert to pandas Series and save as hdf5
        panda_data = pd.Series([feature_values, feature_labels],
                               index=panda_labels,
                               name='Image features')

        # Save the features to the .hdf5 file
        print('\t Saving image features')
        panda_data.to_hdf(output, 'image_features')