Example #1
def plot_boxplot_features(features,
                          label_data,
                          config,
                          output_zip,
                          label_type=None,
                          verbose=False):
    # Load variables from the config file
    config = config_io.load_config(config)

    # Create output folder if required
    if not os.path.exists(os.path.dirname(output_zip)):
        os.makedirs(os.path.dirname(output_zip))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read and stack the features
    if verbose:
        print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, label_data, label_type)

    # Generate the actual boxplots
    generate_feature_boxplots(image_features,
                              label_data,
                              output_zip,
                              verbose=verbose)
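
Below is a minimal usage sketch for plot_boxplot_features; the feature-file paths, label file, and config path are hypothetical placeholders, and the function is assumed to be in scope (e.g. imported from the module that defines it).

# Hypothetical usage sketch: all paths below are placeholders.
features = ['/data/patient1_features.hdf5', '/data/patient2_features.hdf5']
plot_boxplot_features(features=features,
                      label_data='/data/pinfo.txt',
                      config='/data/config.ini',
                      output_zip='/data/boxplots.zip',
                      verbose=True)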
Example #2
def ComBat(features_train_in,
           labels_train,
           config,
           features_train_out,
           features_test_in=None,
           labels_test=None,
           features_test_out=None,
           VarianceThreshold=True,
           scaler=False,
           logarithmic=False):
    """
    Apply ComBat feature harmonization.

    Based on: https://github.com/Jfortin1/ComBatHarmonization
    """
    # Load the config
    print('############################################################')
    print('#                    Initializing ComBat.                  #')
    print('############################################################\n')
    config = cio.load_config(config)
    excluded_features = config['ComBat']['excluded_features']

    # If moderation (mod) variables are specified, also load them;
    # the string '[]' denotes an empty mod list in the config
    if config['ComBat']['mod'][0] == '[]':
        label_names = config['ComBat']['batch']
    else:
        label_names = config['ComBat']['batch'] + config['ComBat']['mod']

    # Load the features for both training and testing, match with batch and mod parameters
    label_data_train, image_features_train =\
        wio.load_features(features_train_in, patientinfo=labels_train,
                          label_type=label_names)

    feature_labels = image_features_train[0][1]
    image_features_train = [i[0] for i in image_features_train]
    label_data_train['patient_IDs'] = list(label_data_train['patient_IDs'])

    # Exclude features
    if excluded_features:
        print(f'\t Excluding features containing: {excluded_features}')
        # Determine indices of excluded features
        included_feature_indices = []
        excluded_feature_indices = []
        for fnum, i in enumerate(feature_labels):
            if not any(e in i for e in excluded_features):
                included_feature_indices.append(fnum)
            else:
                excluded_feature_indices.append(fnum)

        # Actually exclude the features
        image_features_train_combat = [
            np.asarray(i)[included_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_combat = np.asarray(
            feature_labels)[included_feature_indices].tolist()

        image_features_train_noncombat = [
            np.asarray(i)[excluded_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_noncombat = np.asarray(
            feature_labels)[excluded_feature_indices].tolist()

    else:
        image_features_train_combat = image_features_train
        feature_labels_combat = list(feature_labels)

        # Keep one (empty) non-ComBat feature list per patient so that the
        # per-patient indexing further down stays valid
        image_features_train_noncombat = [[] for _ in image_features_train]
        feature_labels_noncombat = []

    # Detect NaNs; if present, apply median imputation first
    if any(
            np.isnan(a)
            for a in np.asarray(image_features_train_combat).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(image_features_train_combat)
        image_features_train_combat = imputer.transform(
            image_features_train_combat)
    else:
        imputer = None

    # Apply a scaler to the features
    if scaler:
        print('\t Fitting scaler on dataset.')
        scaler = StandardScaler().fit(image_features_train_combat)
        image_features_train_combat = scaler.transform(
            image_features_train_combat)

    # Remove features with a constant value
    if VarianceThreshold:
        print('\t Applying variance threshold on dataset.')
        image_features_train_combat, feature_labels_combat, VarSel =\
            selfeat_variance(image_features_train_combat, np.asarray([feature_labels_combat]))
        feature_labels_combat = feature_labels_combat[0].tolist()

    if features_test_in:
        label_data_test, image_features_test =\
            wio.load_features(features_test_in, patientinfo=labels_test,
                              label_type=label_names)

        image_features_test = [i[0] for i in image_features_test]
        label_data_test['patient_IDs'] = list(label_data_test['patient_IDs'])

        if excluded_features:
            image_features_test_combat = [
                np.asarray(i)[included_feature_indices].tolist()
                for i in image_features_test
            ]
            image_features_test_noncombat = [
                np.asarray(i)[excluded_feature_indices].tolist()
                for i in image_features_test
            ]
        else:
            image_features_test_combat = image_features_test
            # As above: one empty list per patient keeps indexing valid
            image_features_test_noncombat = [[] for _ in image_features_test]

        # Apply imputation if required
        if imputer is not None:
            image_features_test_combat = imputer.transform(
                image_features_test_combat)

        # Apply a scaler to the features
        if scaler:
            image_features_test_combat = scaler.transform(
                image_features_test_combat)

        # Remove features with a constant value
        if VarianceThreshold:
            image_features_test_combat = VarSel.transform(
                image_features_test_combat)

        all_features = np.asarray(image_features_train_combat).tolist() +\
            np.asarray(image_features_test_combat).tolist()
        all_labels = list()
        for i in range(label_data_train['label'].shape[0]):
            all_labels.append(label_data_train['label'][i, :, 0].tolist() +
                              label_data_test['label'][i, :, 0].tolist())
        all_labels = np.asarray(all_labels)
    else:
        all_features = np.asarray(image_features_train_combat).tolist()
        all_labels = label_data_train['label']

    # Convert data to a single array
    all_features_matrix = np.asarray(all_features)
    all_labels = np.squeeze(all_labels)

    # Apply logarithm if required
    if logarithmic:
        print('\t Taking log10 of features before applying ComBat.')
        all_features_matrix = np.log10(all_features_matrix)

    # Convert all_labels to dictionary
    if len(all_labels.shape) == 1:
        # No mod variables
        all_labels = {label_data_train['label_name'][0]: all_labels}
    else:
        all_labels = {
            k: v
            for k, v in zip(label_data_train['label_name'], all_labels)
        }

    # Split labels in batch and moderation labels
    bat = config['ComBat']['batch']
    mod = config['ComBat']['mod']
    print(f'\t Using batch variable {bat}, mod variables {mod}.')
    batch = [
        all_labels[l] for l in all_labels.keys()
        if l in config['ComBat']['batch']
    ]
    batch = batch[0]
    if config['ComBat']['mod'][0] == '[]':
        mod = None
    else:
        mod = [
            all_labels[l] for l in all_labels.keys()
            if l in config['ComBat']['mod']
        ]

    # Set parameters for output files
    parameters = {
        'batch': config['ComBat']['batch'],
        'mod': config['ComBat']['mod'],
        'par': config['ComBat']['par']
    }
    name = 'Image features: ComBat corrected'
    panda_labels = [
        'parameters', 'patient', 'feature_values', 'feature_labels'
    ]
    feature_labels = feature_labels_combat + feature_labels_noncombat

    # Convert all inputs to arrays with right shape
    all_features_matrix = np.transpose(all_features_matrix)
    if mod is not None:
        mod = np.transpose(np.asarray(mod))

    # Patients identified with batch -1.0 should be skipped
    skipname = 'Image features: ComBat skipped'
    ntrain = len(image_features_train_combat)
    ndel = 0
    # Note: enumerate keeps iterating over the original batch array even
    # though batch is rebound by np.delete below; subtracting ndel maps the
    # original index onto the already-shrunken arrays
    for bnum, b in enumerate(batch):
        bnum -= ndel
        if b == -1.0:
            if bnum < ntrain - ndel:
                # Training patient
                pid = label_data_train['patient_IDs'][bnum]
                out = features_train_out[bnum]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(
                    all_features_matrix[:, bnum]) + list(
                        image_features_train_noncombat[bnum])

                # Delete patient for later processing
                del label_data_train['patient_IDs'][bnum]
                del image_features_train_noncombat[bnum]
                del features_train_out[bnum]
                image_features_train_combat = np.delete(
                    image_features_train_combat, bnum, 0)

            else:
                # Test patient
                pid = label_data_test['patient_IDs'][bnum - ntrain]
                out = features_test_out[bnum - ntrain]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(
                    all_features_matrix[:, bnum]) + list(
                        image_features_test_noncombat[bnum - ntrain])

                # Delete patient for later processing
                del label_data_test['patient_IDs'][bnum - ntrain]
                del image_features_test_noncombat[bnum - ntrain]
                del features_test_out[bnum - ntrain]
                image_features_test_combat = np.delete(
                    image_features_test_combat, bnum - ntrain, 0)

            # Delete some other variables for later processing
            all_features_matrix = np.delete(all_features_matrix, bnum, 1)
            if mod is not None:
                mod = np.delete(mod, bnum, 0)
            batch = np.delete(batch, bnum, 0)

            # Notify user
            print(
                f'[WARNING] Skipping patient {pid} as batch variable is -1.0.')

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=skipname)

            print(f'\t Saving image features to: {out}.')
            panda_data.to_hdf(out, 'image_features')

            ndel += 1

    # Run ComBat in Matlab
    if config['ComBat']['language'] == 'matlab':
        print('\t Executing ComBat through Matlab')
        data_harmonized = ComBatMatlab(
            dat=all_features_matrix,
            batch=batch,
            command=config['ComBat']['matlab'],
            mod=mod,
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])

    elif config['ComBat']['language'] == 'python':
        print('\t Executing ComBat through neuroComBat in Python')
        data_harmonized = ComBatPython(
            dat=all_features_matrix,
            batch=batch,
            mod=mod,
            eb=config['ComBat']['eb'],
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])
    else:
        raise WORCKeyError(f"Language {config['ComBat']['language']} unknown.")

    # Convert values back if logarithm was used
    if logarithmic:
        data_harmonized = 10**data_harmonized

    # Convert again to train hdf5 files
    feature_values_train_combat = [
        data_harmonized[:, i] for i in range(len(image_features_train_combat))
    ]
    for fnum, i_feat in enumerate(feature_values_train_combat):
        # Combine ComBat and non-ComBat features
        feature_values_temp = i_feat.tolist(
        ) + image_features_train_noncombat[fnum]

        # Sort based on feature label
        feature_labels_temp, feature_values_temp =\
            zip(*sorted(zip(feature_labels, feature_values_temp)))

        # Convert to pandas Series and save as hdf5
        pid = label_data_train['patient_IDs'][fnum]
        panda_data = pd.Series(
            [parameters, pid, feature_values_temp, feature_labels_temp],
            index=panda_labels,
            name=name)

        print(f'Saving image features to: {features_train_out[fnum]}.')
        panda_data.to_hdf(features_train_out[fnum], 'image_features')

    # Repeat for testing if required
    if features_test_in:
        feature_values_test_combat = [
            data_harmonized[:, i] for i in range(
                data_harmonized.shape[1] -
                len(image_features_test_combat), data_harmonized.shape[1])
        ]
        for fnum, i_feat in enumerate(feature_values_test_combat):
            # Combine ComBat and non-ComBat features
            feature_values_temp = i_feat.tolist(
            ) + image_features_test_noncombat[fnum]

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            pid = label_data_test['patient_IDs'][fnum]
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=name)

            print(f'Saving image features to: {features_test_out[fnum]}.')
            panda_data.to_hdf(features_test_out[fnum], 'image_features')
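
A usage sketch for ComBat, assuming one output .hdf5 path per patient (matching the per-patient saves in the function body); all paths are hypothetical placeholders.

# Hypothetical usage sketch: all paths below are placeholders.
features_train_in = ['/data/train1_features.hdf5', '/data/train2_features.hdf5']
features_train_out = ['/data/train1_combat.hdf5', '/data/train2_combat.hdf5']
ComBat(features_train_in=features_train_in,
       labels_train='/data/pinfo_train.txt',
       config='/data/config.ini',
       features_train_out=features_train_out)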
Example #3
def trainclassifier(feat_train,
                    patientinfo_train,
                    config,
                    output_hdf,
                    feat_test=None,
                    patientinfo_test=None,
                    fixedsplits=None,
                    verbose=True):
    """Train a classifier using machine learning from features.

    By default, if no
    split in training and test is supplied, a cross validation
    will be performed.

    Parameters
    ----------
    feat_train: string, mandatory
            contains the paths to all .hdf5 feature files used.
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names always appear between a space and an
            equal sign, and files are separated by commas. We assume that
            the lists of files for each modality have the same length;
            files at the same position in each list should belong to the
            same patient.

    patientinfo_train: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    output_hdf: string, mandatory
            path referring to a .hdf5 file to which the final classifier
            and its properties will be written.

    feat_test: string, optional
            When this argument is supplied, the machine learning will not be
            trained using a cross-validation, but rather using a fixed
            training and test split. This field should contain paths to the
            test set feature files, in the same format as feat_train.

    patientinfo_test: string, optional
            When feat_test is supplied, you can optionally supply a patient
            label file with which the performance will be evaluated.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    verbose: boolean, default True
            whether to print final feature values and labels to the command
            line.

    """
    # Convert inputs from lists to strings
    if type(patientinfo_train) is list:
        patientinfo_train = ''.join(patientinfo_train)

    if type(patientinfo_test) is list:
        patientinfo_test = ''.join(patientinfo_test)

    if type(config) is list:
        if len(config) == 1:
            config = ''.join(config)
        else:
            # FIXME
            print(
                '[WORC Warning] You provided multiple configuration files: only the first one will be used!'
            )
            config = config[0]

    if type(output_hdf) is list:
        if len(output_hdf) == 1:
            output_hdf = ''.join(output_hdf)
        else:
            # FIXME
            print(
                '[WORC Warning] You provided multiple output hdf files: only the first one will be used!'
            )
            output_hdf = output_hdf[0]

    if type(fixedsplits) is list:
        fixedsplits = ''.join(fixedsplits)

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Labels']['label_names']
    modus = config['Labels']['modus']

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type)

    if feat_test:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier grid
    param_grid = cc.create_param_grid(config)

    # Add non-classifier parameters
    param_grid = add_parameters_to_grid(param_grid, config)

    # For N_iter, perform k-fold crossvalidation
    outputfolder = os.path.dirname(output_hdf)
    if feat_test is None:
        trained_classifier = cv.crossval(
            config,
            label_data_train,
            image_features_train,
            param_grid,
            modus=modus,
            use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            fixedsplits=fixedsplits,
            ensemble=config['Ensemble'],
            outputfolder=outputfolder,
            tempsave=config['General']['tempsave'])
    else:
        trained_classifier = cv.nocrossval(
            config,
            label_data_train,
            label_data_test,
            image_features_train,
            image_features_test,
            param_grid,
            modus=modus,
            use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            ensemble=config['Ensemble'])

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'EstimatorData')

    print("Saved data!")
def StatisticalTestFeatures(features, patientinfo, config, output_csv=None,
                            output_png=None, output_tex=None, plot_test='MWU',
                            Bonferonni=True,
                            fontsize='small', yspacing=1,
                            threshold=0.05, verbose=True, label_type=None):
    """Perform several statistical tests on features, such as a student t-test.

    Parameters
    ----------
    features: string, mandatory
            contains the paths to all .hdf5 feature files used.
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names always appear between a space and an
            equal sign, and files are separated by commas. We assume that
            the lists of files for each modality have the same length;
            files at the same position in each list should belong to the
            same patient.

    patientinfo: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    # TODO: outputs

    verbose: boolean, default True
            whether to print final feature values and labels to the command
            line.

    """
    # Convert inputs from lists to strings
    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    if type(output_csv) is list:
        output_csv = ''.join(output_csv)

    if type(output_png) is list:
        output_png = ''.join(output_png)

    if type(output_tex) is list:
        output_tex = ''.join(output_tex)

    # Load variables from the config file; this must happen after the
    # list-to-string conversions above so load_config receives a path
    config = config_io.load_config(config)

    # Create output folder if required
    if output_csv is not None and \
            not os.path.exists(os.path.dirname(output_csv)):
        os.makedirs(os.path.dirname(output_csv))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # -----------------------------------------------------------------------
    # Perform statistical tests
    print("Performing statistical tests.")
    label_value = label_data['label']
    label_name = label_data['label_name']

    header = list()
    subheader = list()
    for i_name in label_name:
        # Six test columns plus a spacer per label; keep the header row
        # aligned with the subheader row below
        header.append(str(i_name[0]))
        header.extend([''] * 6)

        subheader.extend(['Label', 'Ttest', 'Welch', 'Wilcoxon',
                          'Mann-Whitney', 'Chi2', ''])

    # Open the output_csv file
    if output_csv is not None:
        myfile = open(output_csv, 'w')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerow(subheader)

    savedict = dict()
    for i_class, i_name in zip(label_value, label_name):
        savedict[i_name[0]] = dict()
        pvalues = list()
        pvalueswelch = list()
        pvalueswil = list()
        pvaluesmw = list()
        pvalueschi2 = list()
        classlabels = i_class.ravel()

        for num, fl in enumerate(feature_labels):
            fv = feature_values[:, num]

            # Remove NaN values, keeping the class labels aligned with the
            # remaining feature values
            mask = ~np.isnan(fv)
            fv = fv[mask]
            classlabels_fv = classlabels[mask]

            class1 = [i for j, i in enumerate(fv) if classlabels_fv[j] == 1]
            class2 = [i for j, i in enumerate(fv) if classlabels_fv[j] == 0]

            pvalues.append(ttest_ind(class1, class2)[1])
            pvalueswelch.append(ttest_ind(class1, class2, equal_var=False)[1])
            pvalueswil.append(ranksums(class1, class2)[1])
            try:
                pvaluesmw.append(mannwhitneyu(class1, class2)[1])
            except ValueError as e:
                print("[WORC Warning] " + str(e) + '. Replacing metric value by NaN.')
                pvaluesmw.append(np.nan)

            # Optional: perform a chi2 test. Only do this for categorical
            # features, defined here as having at most 20 unique values.
            unique_values = list(set(fv))
            unique_values.sort()
            if len(unique_values) == 1:
                print("[WORC Warning] " + fl + " has only one value. Replacing chi2 metric value by NaN.")
                pvalueschi2.append(np.nan)
            elif len(unique_values) <= 20:
                class1_count = [class1.count(i) for i in unique_values]
                class2_count = [class2.count(i) for i in unique_values]
                obs = np.array([class1_count, class2_count])

                _, p, _, _ = chi2_contingency(obs)
                pvalueschi2.append(p)
            else:
                print("[WORC Warning] " + fl + " is no categorical variable. Replacing chi2 metric value by NaN.")
                pvalueschi2.append(np.nan)

        # Sort based on p-values:
        indices = np.argsort(np.asarray(pvaluesmw))
        feature_labels_o = np.asarray(feature_labels)[indices].tolist()

        pvalues = np.asarray(pvalues)[indices].tolist()
        pvalueswelch = np.asarray(pvalueswelch)[indices].tolist()
        pvalueswil = np.asarray(pvalueswil)[indices].tolist()
        pvaluesmw = np.asarray(pvaluesmw)[indices].tolist()
        pvalueschi2 = np.asarray(pvalueschi2)[indices].tolist()

        savedict[i_name[0]]['ttest'] = pvalues
        savedict[i_name[0]]['welch'] = pvalueswelch
        savedict[i_name[0]]['wil'] = pvalueswil
        savedict[i_name[0]]['mw'] = pvaluesmw
        savedict[i_name[0]]['chi2'] = pvalueschi2
        savedict[i_name[0]]['labels'] = feature_labels_o

    if output_csv is not None:
        for num in range(0, len(savedict[i_name[0]]['ttest'])):
            writelist = list()
            for i_name in savedict.keys():
                labeldict = savedict[i_name]
                writelist.append(labeldict['labels'][num])
                writelist.append(labeldict['ttest'][num])
                writelist.append(labeldict['welch'][num])
                writelist.append(labeldict['wil'][num])
                writelist.append(labeldict['mw'][num])
                writelist.append(labeldict['chi2'][num])
                writelist.append('')

            wr.writerow(writelist)

        print("Saved data to CSV!")

    if output_png is not None or output_tex is not None:
        # Initialize objects; use the last processed label. Looking the
        # label up in savedict also works when no CSV output was written.
        labeldict = savedict[list(savedict.keys())[-1]]
        objects_temp = labeldict['labels']
        if plot_test == 'MWU':
            p_values_temp = labeldict['mw']
        else:
            raise KeyError(f'Plot test {plot_test} is not supported.')

        # remove the nan
        objects = list()
        p_values = list()
        for o, p in zip(objects_temp, p_values_temp):
            if not np.isnan(p):
                objects.append(o)
                p_values.append(p)

        # Debug defaults
        if DebugDetector().do_detection():
            # No correction
            Bonferonni = False

            # Just select ~10 features
            sorted_p = p_values[:]
            sorted_p.sort()
            threshold = sorted_p[10]

        if Bonferonni:
            # Apply Bonferroni correction for multiple testing
            threshold = threshold / len(p_values)

        # Create labels
        labels = list()
        mapping = {0: 'Histogram',
                   1: 'Shape',
                   2: 'Orientation',
                   3: 'GLCM',
                   4: 'GLRLM',
                   5: 'GLSZM',
                   6: 'GLDM',
                   7: 'NGTDM',
                   8: 'Gabor',
                   9: 'Semantic',
                   10: 'DICOM',
                   11: 'LoG',
                   12: 'Vessel',
                   13: 'LBP',
                   14: 'Phase'
                   }
        for o in objects:
            if 'hf_' in o:
                labels.append(0)
            elif 'sf_' in o:
                labels.append(1)
            elif 'of_' in o:
                labels.append(2)
            elif 'GLCM_' in o or 'GLCMMS_' in o:
                labels.append(3)
            elif 'GLRLM_' in o:
                labels.append(4)
            elif 'GLSZM_' in o:
                labels.append(5)
            elif 'GLDM_' in o:
                labels.append(6)
            elif 'NGTDM_' in o:
                labels.append(7)
            elif 'Gabor_' in o:
                labels.append(8)
            elif 'semf_' in o:
                labels.append(9)
            elif 'dicomf_' in o:
                labels.append(10)
            elif 'LoG_' in o:
                labels.append(11)
            elif 'vf_' in o:
                labels.append(12)
            elif 'LBP_' in o:
                labels.append(13)
            elif 'phasef_' in o:
                labels.append(14)
            else:
                raise KeyError(o)

        # Replace several labels
        objects = [o.replace('CalcFeatures_', '') for o in objects]
        objects = [o.replace('featureconverter_', '') for o in objects]
        objects = [o.replace('PREDICT_', '') for o in objects]
        objects = [o.replace('PyRadiomics_', '') for o in objects]
        objects = [o.replace('Pyradiomics_', '') for o in objects]
        objects = [o.replace('predict_', '') for o in objects]
        objects = [o.replace('pyradiomics_', '') for o in objects]
        objects = [o.replace('original_', '') for o in objects]
        objects = [o.replace('train_', '') for o in objects]
        objects = [o.replace('test_', '') for o in objects]
        objects = [o.replace('1_0_', '') for o in objects]
        objects = [o.replace('hf_', '') for o in objects]
        objects = [o.replace('sf_', '') for o in objects]
        objects = [o.replace('of_', '') for o in objects]
        objects = [o.replace('GLCM_', '') for o in objects]
        objects = [o.replace('GLCMMS_', '') for o in objects]
        objects = [o.replace('GLRLM_', '') for o in objects]
        objects = [o.replace('GLSZM_', '') for o in objects]
        objects = [o.replace('GLDM_', '') for o in objects]
        objects = [o.replace('NGTDM_', '') for o in objects]
        objects = [o.replace('Gabor_', '') for o in objects]
        objects = [o.replace('semf_', '') for o in objects]
        objects = [o.replace('dicomf_', '') for o in objects]
        objects = [o.replace('LoG_', '') for o in objects]
        objects = [o.replace('vf_', '') for o in objects]
        objects = [o.replace('LBP_', '') for o in objects]
        objects = [o.replace('phasef_', '') for o in objects]
        objects = [o.replace('tf_', '') for o in objects]
        objects = [o.replace('_CT_0', '') for o in objects]
        objects = [o.replace('_MR_0', '') for o in objects]
        objects = [o.replace('CT_0', '') for o in objects]
        objects = [o.replace('MR_0', '') for o in objects]

        # Sort p-values, group labels, and feature names together, based on
        # the group labels
        sort_indices = np.argsort(np.asarray(labels))
        p_values = [p_values[i] for i in sort_indices]
        labels = [labels[i] for i in sort_indices]
        objects = [objects[i] for i in sort_indices]

        # Make manhattan plot
        manhattan_importance(values=p_values,
                             labels=labels,
                             output_png=output_png,
                             feature_labels=objects,
                             threshold_annotated=threshold,
                             mapping=mapping,
                             output_tex=output_tex)

    return savedict
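
A usage sketch for StatisticalTestFeatures; the returned dictionary maps each label name to the per-test p-values, sorted by the Mann-Whitney U p-value. All paths are hypothetical placeholders.

# Hypothetical usage sketch: all paths below are placeholders.
features = ['MR=/data/patient1_features.hdf5,/data/patient2_features.hdf5']
savedict = StatisticalTestFeatures(features=features,
                                   patientinfo='/data/pinfo.txt',
                                   config='/data/config.ini',
                                   output_csv='/data/pvalues.csv',
                                   output_png='/data/manhattan.png')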
def Decomposition(features,
                  patientinfo,
                  config,
                  output,
                  label_type=None,
                  verbose=True):
    """
    Perform decompositions to two components of the feature space.

    Useage is similar to StatisticalTestFeatures.

    Parameters
    ----------
    features: string, mandatory
            contains the paths to all .hdf5 feature files used.
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names always appear between a space and an
            equal sign, and files are separated by commas. We assume that
            the lists of files for each modality have the same length;
            files at the same position in each list should belong to the
            same patient.

    patientinfo: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    # TODO: outputs

    verbose: boolean, default True
            whether to print final feature values and labels to the command
            line.

    """
    # Load variables from the config file
    config = config_io.load_config(config)

    # Create output folder if required
    if not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # Detect NaNs; if present, apply median imputation first
    if any(np.isnan(a) for a in np.asarray(feature_values).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(feature_values)
        feature_values = imputer.transform(feature_values)

    # -----------------------------------------------------------------------
    # Perform decomposition
    print("Performing decompositions.")
    label_value = label_data['label']
    label_name = label_data['label_name']

    # Reduce to two components for plotting
    n_components = 2

    for i_class, i_name in zip(label_value, label_name):
        classlabels = i_class.ravel()

        class1 = [
            i for j, i in enumerate(feature_values) if classlabels[j] == 1
        ]
        class2 = [
            i for j, i in enumerate(feature_values) if classlabels[j] == 0
        ]

        f = plt.figure(figsize=(20, 15))

        # -------------------------------------------------------
        # Fit PCA
        pca = PCA(n_components=n_components)
        pca.fit(feature_values)
        explained_variance_ratio = np.sum(pca.explained_variance_ratio_)
        class1_pca = pca.transform(class1)
        class2_pca = pca.transform(class2)

        # Plot PCA
        ax = plt.subplot(2, 3, 1)

        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
        ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
        ax.set_title(f'PCA: {round(explained_variance_ratio, 3)} variance.')

        # -------------------------------------------------------
        # Fit Sparse PCA
        pca = SparsePCA(n_components=n_components)
        pca.fit(feature_values)
        class1_pca = pca.transform(class1)
        class2_pca = pca.transform(class2)

        # Plot Sparse PCA
        ax = plt.subplot(2, 3, 2)

        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
        ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
        ax.set_title('Sparse PCA.')

        # -------------------------------------------------------
        # Fit Kernel PCA
        fnum = 3
        for kernel in ['linear', 'poly', 'rbf']:
            pca = KernelPCA(n_components=n_components, kernel=kernel)
            pca.fit(feature_values)
            class1_pca = pca.transform(class1)
            class2_pca = pca.transform(class2)

            # Plot Kernel PCA
            ax = plt.subplot(2, 3, fnum)

            plt.subplots_adjust(hspace=0.3, wspace=0.2)
            ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
            ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
            ax.set_title(f'Kernel PCA: {kernel}.')
            fnum += 1

        # -------------------------------------------------------
        # Fit t-SNE
        tSNE = TSNE(n_components=n_components)
        class_all = class1 + class2
        class_all_tsne = tSNE.fit_transform(class_all)

        class1_tSNE = class_all_tsne[0:len(class1)]
        class2_tSNE = class_all_tsne[len(class1):]

        # Plot t-SNE
        ax = plt.subplot(2, 3, 6)

        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_tSNE[:, 0], class1_tSNE[:, 1], color='blue')
        ax.scatter(class2_tSNE[:, 0], class2_tSNE[:, 1], color='green')
        ax.set_title('t-SNE.')

        # -------------------------------------------------------
        # Maximize figure to get correct spacings
        # mng = plt.get_current_fig_manager()
        # mng.resize(*mng.window.maxsize())

        # High DPI to make sure we save the full-size image
        f.savefig(output, dpi=600)
        print(("Decomposition saved as {} !").format(output))