# Imports assumed by the functions below; the third-party ones are standard.
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, ranksums, mannwhitneyu, chi2_contingency
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.manifold import TSNE

# NOTE: the WORC-internal helpers used below (config_io, cio, wio,
# load_features, generate_feature_boxplots, Imputer, selfeat_variance,
# ComBatPython, ComBatMatlab, WORCKeyError, DebugDetector, cc, cv,
# add_parameters_to_grid, manhattan_importance) are assumed to be imported
# from the surrounding WORC package; their exact module paths depend on the
# WORC version.


def plot_boxplot_features(features, label_data, config, output_zip,
                          label_type=None, verbose=False):
    # Load variables from the config file
    config = config_io.load_config(config)

    # Create output folder if required
    if not os.path.exists(os.path.dirname(output_zip)):
        os.makedirs(os.path.dirname(output_zip))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read and stack the features
    if verbose:
        print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, label_data, label_type)

    # Generate the actual boxplots
    generate_feature_boxplots(image_features, label_data, output_zip,
                              verbose=verbose)
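
# Illustrative usage sketch (not part of the original module): the file
# paths below are hypothetical placeholders. Feature inputs follow the
# 'modalityname=file1,file2,...' convention used throughout WORC.
#
# plot_boxplot_features(
#     features=['MR=/data/feat_patient1.hdf5,/data/feat_patient2.hdf5'],
#     label_data='/data/pinfo.txt',
#     config='/data/config.ini',
#     output_zip='/data/output/boxplots.zip',
#     verbose=True)
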
def ComBat(features_train_in, labels_train, config, features_train_out,
           features_test_in=None, labels_test=None, features_test_out=None,
           VarianceThreshold=True, scaler=False, logarithmic=False):
    """
    Apply ComBat feature harmonization.

    Based on: https://github.com/Jfortin1/ComBatHarmonization
    """
    # Load the config
    print('############################################################')
    print('#                   Initializing ComBat.                  #')
    print('############################################################\n')
    config = cio.load_config(config)
    excluded_features = config['ComBat']['excluded_features']

    # If mod is given, then also load the moderating labels
    if config['ComBat']['mod'][0] == '[]':
        label_names = config['ComBat']['batch']
    else:
        label_names = config['ComBat']['batch'] + config['ComBat']['mod']

    # Load the features for both training and testing, match with batch
    # and mod parameters
    label_data_train, image_features_train =\
        wio.load_features(features_train_in, patientinfo=labels_train,
                          label_type=label_names)

    feature_labels = image_features_train[0][1]
    image_features_train = [i[0] for i in image_features_train]
    label_data_train['patient_IDs'] = list(label_data_train['patient_IDs'])

    # Exclude features
    if excluded_features:
        print(f'\t Excluding features containing: {excluded_features}')

        # Determine indices of excluded features
        included_feature_indices = []
        excluded_feature_indices = []
        for fnum, i in enumerate(feature_labels):
            if not any(e in i for e in excluded_features):
                included_feature_indices.append(fnum)
            else:
                excluded_feature_indices.append(fnum)

        # Actually exclude the features
        image_features_train_combat = [
            np.asarray(i)[included_feature_indices].tolist()
            for i in image_features_train]
        feature_labels_combat = np.asarray(
            feature_labels)[included_feature_indices].tolist()

        image_features_train_noncombat = [
            np.asarray(i)[excluded_feature_indices].tolist()
            for i in image_features_train]
        feature_labels_noncombat = np.asarray(
            feature_labels)[excluded_feature_indices].tolist()
    else:
        image_features_train_combat = image_features_train
        feature_labels_combat = feature_labels.tolist()
        image_features_train_noncombat = []
        feature_labels_noncombat = []

    # Detect NaNs; if present, feature imputation is required first
    if any(np.isnan(a)
           for a in np.asarray(image_features_train_combat).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(image_features_train_combat)
        image_features_train_combat = imputer.transform(
            image_features_train_combat)
    else:
        imputer = None

    # Apply a scaler to the features
    if scaler:
        print('\t Fitting scaler on dataset.')
        scaler = StandardScaler().fit(image_features_train_combat)
        image_features_train_combat = scaler.transform(
            image_features_train_combat)

    # Remove features with a constant value
    if VarianceThreshold:
        print('\t Applying variance threshold on dataset.')
        image_features_train_combat, feature_labels_combat, VarSel =\
            selfeat_variance(image_features_train_combat,
                             np.asarray([feature_labels_combat]))
        feature_labels_combat = feature_labels_combat[0].tolist()

    if features_test_in:
        label_data_test, image_features_test =\
            wio.load_features(features_test_in, patientinfo=labels_test,
                              label_type=label_names)

        image_features_test = [i[0] for i in image_features_test]
        label_data_test['patient_IDs'] = list(label_data_test['patient_IDs'])

        if excluded_features:
            image_features_test_combat = [
                np.asarray(i)[included_feature_indices].tolist()
                for i in image_features_test]
            image_features_test_noncombat = [
                np.asarray(i)[excluded_feature_indices].tolist()
                for i in image_features_test]
        else:
            image_features_test_combat = image_features_test
            image_features_test_noncombat = []

        # Apply imputation if required
        if imputer is not None:
            image_features_test_combat = imputer.transform(
                image_features_test_combat)

        # Apply a scaler to the features
        if scaler:
            image_features_test_combat = scaler.transform(
                image_features_test_combat)

        # Remove features with a constant value
        if VarianceThreshold:
            image_features_test_combat = VarSel.transform(
                image_features_test_combat)

        all_features = image_features_train_combat.tolist() +\
            image_features_test_combat.tolist()
        all_labels = list()
        for i in range(label_data_train['label'].shape[0]):
            all_labels.append(label_data_train['label'][i, :, 0].tolist() +
                              label_data_test['label'][i, :, 0].tolist())
        all_labels = np.asarray(all_labels)
    else:
        all_features = image_features_train_combat.tolist()
        all_labels = label_data_train['label']

    # Convert data to a single array
    all_features_matrix = np.asarray(all_features)
    all_labels = np.squeeze(all_labels)

    # Apply logarithm if required
    if logarithmic:
        print('\t Taking log10 of features before applying ComBat.')
        all_features_matrix = np.log10(all_features_matrix)

    # Convert all_labels to dictionary
    if len(all_labels.shape) == 1:
        # No mod variables
        all_labels = {label_data_train['label_name'][0]: all_labels}
    else:
        all_labels = {k: v for k, v in
                      zip(label_data_train['label_name'], all_labels)}

    # Split labels into batch and moderation labels
    bat = config['ComBat']['batch']
    mod = config['ComBat']['mod']
    print(f'\t Using batch variable {bat}, mod variables {mod}.')
    batch = [all_labels[l] for l in all_labels.keys()
             if l in config['ComBat']['batch']]
    batch = batch[0]
    if config['ComBat']['mod'][0] == '[]':
        mod = None
    else:
        mod = [all_labels[l] for l in all_labels.keys()
               if l in config['ComBat']['mod']]

    # Set parameters for output files
    parameters = {'batch': config['ComBat']['batch'],
                  'mod': config['ComBat']['mod'],
                  'par': config['ComBat']['par']}
    name = 'Image features: ComBat corrected'
    panda_labels = ['parameters', 'patient', 'feature_values',
                    'feature_labels']
    feature_labels = feature_labels_combat + feature_labels_noncombat

    # Convert all inputs to arrays with the right shape
    all_features_matrix = np.transpose(all_features_matrix)
    if mod is not None:
        mod = np.transpose(np.asarray(mod))

    # Patients identified with batch -1.0 should be skipped
    skipname = 'Image features: ComBat skipped'
    ntrain = len(image_features_train_combat)
    ndel = 0
    for bnum, b in enumerate(batch):
        # Compensate the index for patients already deleted
        bnum -= ndel
        if b == -1.0:
            if bnum < ntrain - ndel:
                # Training patient
                pid = label_data_train['patient_IDs'][bnum]
                out = features_train_out[bnum]

                # Combine ComBat and non-ComBat features
                feature_values_temp =\
                    list(all_features_matrix[:, bnum]) +\
                    list(image_features_train_noncombat[bnum])

                # Delete patient for later processing
                del label_data_train['patient_IDs'][bnum]
                del image_features_train_noncombat[bnum]
                del features_train_out[bnum]
                image_features_train_combat = np.delete(
                    image_features_train_combat, bnum, 0)
            else:
                # Test patient
                pid = label_data_test['patient_IDs'][bnum - ntrain]
                out = features_test_out[bnum - ntrain]

                # Combine ComBat and non-ComBat features
                feature_values_temp =\
                    list(all_features_matrix[:, bnum]) +\
                    list(image_features_test_noncombat[bnum - ntrain])

                # Delete patient for later processing
                del label_data_test['patient_IDs'][bnum - ntrain]
                del image_features_test_noncombat[bnum - ntrain]
                del features_test_out[bnum - ntrain]
                image_features_test_combat = np.delete(
                    image_features_test_combat, bnum - ntrain, 0)

            # Delete some other variables for later processing
            all_features_matrix = np.delete(all_features_matrix, bnum, 1)
            if mod is not None:
                mod = np.delete(mod, bnum, 0)
            batch = np.delete(batch, bnum, 0)

            # Notify user
            print(f'[WARNING] Skipping patient {pid} as batch variable is -1.0.')

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            panda_data = pd.Series([parameters, pid, feature_values_temp,
                                    feature_labels_temp],
                                   index=panda_labels,
                                   name=skipname)

            print(f'\t Saving image features to: {out}.')
            panda_data.to_hdf(out, 'image_features')
            ndel += 1

    # Run ComBat in Matlab or Python
    if config['ComBat']['language'] == 'matlab':
        print('\t Executing ComBat through Matlab')
        data_harmonized = ComBatMatlab(
            dat=all_features_matrix,
            batch=batch,
            command=config['ComBat']['matlab'],
            mod=mod,
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])
    elif config['ComBat']['language'] == 'python':
        print('\t Executing ComBat through neuroComBat in Python')
        data_harmonized = ComBatPython(
            dat=all_features_matrix,
            batch=batch,
            mod=mod,
            eb=config['ComBat']['eb'],
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])
    else:
        raise WORCKeyError(f"Language {config['ComBat']['language']} unknown.")

    # Convert values back if logarithm was used
    if logarithmic:
        data_harmonized = 10 ** data_harmonized

    # Convert again to train hdf5 files
    feature_values_train_combat = [
        data_harmonized[:, i]
        for i in range(len(image_features_train_combat))]
    for fnum, i_feat in enumerate(feature_values_train_combat):
        # Combine ComBat and non-ComBat features
        feature_values_temp = i_feat.tolist() +\
            image_features_train_noncombat[fnum]

        # Sort based on feature label
        feature_labels_temp, feature_values_temp =\
            zip(*sorted(zip(feature_labels, feature_values_temp)))

        # Convert to pandas Series and save as hdf5
        pid = label_data_train['patient_IDs'][fnum]
        panda_data = pd.Series([parameters, pid, feature_values_temp,
                                feature_labels_temp],
                               index=panda_labels,
                               name=name)

        print(f'Saving image features to: {features_train_out[fnum]}.')
        panda_data.to_hdf(features_train_out[fnum], 'image_features')

    # Repeat for testing if required
    if features_test_in:
        feature_values_test_combat = [
            data_harmonized[:, i]
            for i in range(data_harmonized.shape[1] -
                           len(image_features_test_combat),
                           data_harmonized.shape[1])]
        for fnum, i_feat in enumerate(feature_values_test_combat):
            # Combine ComBat and non-ComBat features
            feature_values_temp = i_feat.tolist() +\
                image_features_test_noncombat[fnum]

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            pid = label_data_test['patient_IDs'][fnum]
            panda_data = pd.Series([parameters, pid, feature_values_temp,
                                    feature_labels_temp],
                                   index=panda_labels,
                                   name=name)

            print(f'Saving image features to: {features_test_out[fnum]}.')
            panda_data.to_hdf(features_test_out[fnum], 'image_features')
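
# Illustrative usage sketch (hypothetical paths; assumes the config .ini
# contains a [ComBat] section with at least the batch, mod, par, eb,
# language, per_feature and excluded_features fields read above). The batch
# variable, e.g. scanner or hospital, must be a label in the patient info
# files; patients with batch value -1.0 are skipped as shown above.
#
# ComBat(
#     features_train_in=['/data/train_p1.hdf5', '/data/train_p2.hdf5'],
#     labels_train='/data/pinfo_train.txt',
#     config='/data/config.ini',
#     features_train_out=['/data/train_p1_combat.hdf5',
#                         '/data/train_p2_combat.hdf5'],
#     features_test_in=['/data/test_p1.hdf5'],
#     labels_test='/data/pinfo_test.txt',
#     features_test_out=['/data/test_p1_combat.hdf5'])
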
def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf, feat_test=None, patientinfo_test=None,
                    fixedsplits=None, verbose=True):
    """Train a classifier using machine learning from features.

    By default, if no split in training and test is supplied, a cross
    validation will be performed.

    Parameters
    ----------
    feat_train: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equals
        sign, files are split by commas. We assume that the lists of
        files for each modality have the same length. Files on the same
        position on each list should belong to the same patient.

    patientinfo_train: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used
        for feature extraction. See the Github Wiki for the possible
        fields and their description.

    output_hdf: string, mandatory
        path referring to a .hdf5 file to which the final classifier
        and its properties will be written.

    feat_test: string, optional
        When this argument is supplied, the machine learning will not
        be trained using a cross validation, but rather using a fixed
        training and test split. This field should contain paths of the
        test set feature files, similar to the feat_train argument.

    patientinfo_test: string, optional
        When feat_test is supplied, you can optionally supply a patient
        label file through which the performance will be evaluated.

    fixedsplits: string, optional
        By default, random-split cross validation is used to train and
        evaluate the machine learning methods. Optionally, you can
        provide a .xlsx file containing fixed splits to be used. See
        the Github Wiki for the format.

    verbose: boolean, default True
        Whether to print final feature values and labels to the command
        line.

    """
    # Convert inputs from lists to strings
    if isinstance(patientinfo_train, list):
        patientinfo_train = ''.join(patientinfo_train)

    if isinstance(patientinfo_test, list):
        patientinfo_test = ''.join(patientinfo_test)

    if isinstance(config, list):
        if len(config) == 1:
            config = ''.join(config)
        else:
            # FIXME: only a single configuration file is currently supported
            print('[WORC Warning] You provided multiple configuration files: only the first one will be used!')
            config = config[0]

    if isinstance(output_hdf, list):
        if len(output_hdf) == 1:
            output_hdf = ''.join(output_hdf)
        else:
            # FIXME: only a single output file is currently supported
            print('[WORC Warning] You provided multiple output hdf files: only the first one will be used!')
            output_hdf = output_hdf[0]

    if isinstance(fixedsplits, list):
        fixedsplits = ''.join(fixedsplits)

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Labels']['label_names']
    modus = config['Labels']['modus']

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type)

    if feat_test:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier grid
    param_grid = cc.create_param_grid(config)

    # Add non-classifier parameters
    param_grid = add_parameters_to_grid(param_grid, config)

    # For N_iter, perform k-fold cross validation
    outputfolder = os.path.dirname(output_hdf)
    if feat_test is None:
        trained_classifier = cv.crossval(
            config, label_data_train, image_features_train,
            param_grid, modus=modus,
            use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            fixedsplits=fixedsplits, ensemble=config['Ensemble'],
            outputfolder=outputfolder,
            tempsave=config['General']['tempsave'])
    else:
        trained_classifier = cv.nocrossval(
            config, label_data_train, label_data_test,
            image_features_train, image_features_test,
            param_grid, modus=modus,
            use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            ensemble=config['Ensemble'])

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'EstimatorData')
    print("Saved data!")
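
# Illustrative usage sketch (hypothetical paths). With feat_test omitted,
# a cross validation is run as described in the docstring; the feat_train
# entries use the 'modalityname=file1,file2,...' convention, with files in
# the same list position belonging to the same patient.
#
# trainclassifier(
#     feat_train=['MR=/data/feat_p1.hdf5,/data/feat_p2.hdf5'],
#     patientinfo_train='/data/pinfo.txt',
#     config='/data/config.ini',
#     output_hdf='/data/output/estimator.hdf5')
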
def StatisticalTestFeatures(features, patientinfo, config, output_csv=None,
                            output_png=None, output_tex=None,
                            plot_test='MWU', Bonferonni=True,
                            fontsize='small', yspacing=1, threshold=0.05,
                            verbose=True, label_type=None):
    """Perform several statistical tests on features, such as Student's t-test.

    Parameters
    ----------
    features: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equals
        sign, files are split by commas. We assume that the lists of
        files for each modality have the same length. Files on the same
        position on each list should belong to the same patient.

    patientinfo: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used
        for feature extraction. See the Github Wiki for the possible
        fields and their description.

    # TODO: outputs

    verbose: boolean, default True
        Whether to print final feature values and labels to the command
        line.

    """
    # Load variables from the config file
    config = config_io.load_config(config)

    if isinstance(patientinfo, list):
        patientinfo = ''.join(patientinfo)

    if isinstance(config, list):
        config = ''.join(config)

    if isinstance(output_csv, list):
        output_csv = ''.join(output_csv)

    if isinstance(output_png, list):
        output_png = ''.join(output_png)

    if isinstance(output_tex, list):
        output_tex = ''.join(output_tex)

    # Create output folder if required
    if output_csv is not None and not os.path.exists(os.path.dirname(output_csv)):
        os.makedirs(os.path.dirname(output_csv))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # -------------------------------------------------------------------
    # Perform statistical tests
    print("Performing statistical tests.")
    label_value = label_data['label']
    label_name = label_data['label_name']

    header = list()
    subheader = list()
    for i_name in label_name:
        header.append(str(i_name[0]))
        header.append('')
        header.append('')
        header.append('')
        header.append('')
        header.append('')

        subheader.append('Label')
        subheader.append('Ttest')
        subheader.append('Welch')
        subheader.append('Wilcoxon')
        subheader.append('Mann-Whitney')
        subheader.append('Chi2')
        subheader.append('')

    # Open the output_csv file
    if output_csv is not None:
        myfile = open(output_csv, 'w', newline='')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerow(subheader)

    savedict = dict()
    for i_class, i_name in zip(label_value, label_name):
        savedict[i_name[0]] = dict()
        pvalues = list()
        pvalueswelch = list()
        pvalueswil = list()
        pvaluesmw = list()
        pvalueschi2 = list()
        classlabels = i_class.ravel()
        for num, fl in enumerate(feature_labels):
            fv = feature_values[:, num]

            # Remove NaN values
            fv = fv[~np.isnan(fv)]

            class1 = [i for j, i in enumerate(fv) if classlabels[j] == 1]
            class2 = [i for j, i in enumerate(fv) if classlabels[j] == 0]

            pvalues.append(ttest_ind(class1, class2)[1])
            pvalueswelch.append(ttest_ind(class1, class2, equal_var=False)[1])
            pvalueswil.append(ranksums(class1, class2)[1])
            try:
                pvaluesmw.append(mannwhitneyu(class1, class2)[1])
            except ValueError as e:
                print("[WORC Warning] " + str(e) + '. Replacing metric value by NaN.')
                pvaluesmw.append(np.nan)

            # Optional: perform chi2 test. Only do this when categorical,
            # which we define as having at most 20 unique values.
            unique_values = list(set(fv))
            unique_values.sort()
            if len(unique_values) == 1:
                print("[WORC Warning] " + fl + " has only one value. Replacing chi2 metric value by NaN.")
                pvalueschi2.append(np.nan)
            elif len(unique_values) <= 20:
                class1_count = [class1.count(i) for i in unique_values]
                class2_count = [class2.count(i) for i in unique_values]
                obs = np.array([class1_count, class2_count])
                _, p, _, _ = chi2_contingency(obs)
                pvalueschi2.append(p)
            else:
                print("[WORC Warning] " + fl + " is not a categorical variable. Replacing chi2 metric value by NaN.")
                pvalueschi2.append(np.nan)

        # Sort based on the Mann-Whitney U p-values
        indices = np.argsort(np.asarray(pvaluesmw))
        feature_labels_o = np.asarray(feature_labels)[indices].tolist()
        pvalues = np.asarray(pvalues)[indices].tolist()
        pvalueswelch = np.asarray(pvalueswelch)[indices].tolist()
        pvalueswil = np.asarray(pvalueswil)[indices].tolist()
        pvaluesmw = np.asarray(pvaluesmw)[indices].tolist()
        pvalueschi2 = np.asarray(pvalueschi2)[indices].tolist()

        savedict[i_name[0]]['ttest'] = pvalues
        savedict[i_name[0]]['welch'] = pvalueswelch
        savedict[i_name[0]]['wil'] = pvalueswil
        savedict[i_name[0]]['mw'] = pvaluesmw
        savedict[i_name[0]]['chi2'] = pvalueschi2
        savedict[i_name[0]]['labels'] = feature_labels_o

    # Write one row per feature, with one column group per label
    if output_csv is not None:
        for num in range(0, len(savedict[i_name[0]]['ttest'])):
            writelist = list()
            for key in savedict.keys():
                labeldict = savedict[key]
                writelist.append(labeldict['labels'][num])
                writelist.append(labeldict['ttest'][num])
                writelist.append(labeldict['welch'][num])
                writelist.append(labeldict['wil'][num])
                writelist.append(labeldict['mw'][num])
                writelist.append(labeldict['chi2'][num])
                writelist.append('')

            wr.writerow(writelist)

        myfile.close()
        print("Saved data to CSV!")

    if output_png is not None or output_tex is not None:
        # Plot the results of the last label
        labeldict = savedict[list(savedict.keys())[-1]]

        # Initialize objects
        objects_temp = labeldict['labels']
        if plot_test == 'MWU':
            p_values_temp = labeldict['mw']
        else:
            raise WORCKeyError(f'Plot test {plot_test} unknown.')

        # Remove the NaNs
        objects = list()
        p_values = list()
        for o, p in zip(objects_temp, p_values_temp):
            if not np.isnan(p):
                objects.append(o)
                p_values.append(p)

        # Debug defaults
        if DebugDetector().do_detection():
            # No correction
            Bonferonni = False

            # Just select ~10 features
            sorted_p = p_values[:]
            sorted_p.sort()
            threshold = sorted_p[10]

        if Bonferonni:
            # Apply Bonferroni correction for multiple testing
            threshold = threshold / len(p_values)

        # Create labels
        labels = list()
        mapping = {0: 'Histogram',
                   1: 'Shape',
                   2: 'Orientation',
                   3: 'GLCM',
                   4: 'GLRLM',
                   5: 'GLSZM',
                   6: 'GLDM',
                   7: 'NGTDM',
                   8: 'Gabor',
                   9: 'Semantic',
                   10: 'DICOM',
                   11: 'LoG',
                   12: 'Vessel',
                   13: 'LBP',
                   14: 'Phase'}
        for o in objects:
            if 'hf_' in o:
                labels.append(0)
            elif 'sf_' in o:
                labels.append(1)
            elif 'of_' in o:
                labels.append(2)
            elif 'GLCM_' in o or 'GLCMMS_' in o:
                labels.append(3)
            elif 'GLRLM_' in o:
                labels.append(4)
            elif 'GLSZM_' in o:
                labels.append(5)
            elif 'GLDM_' in o:
                labels.append(6)
            elif 'NGTDM_' in o:
                labels.append(7)
            elif 'Gabor_' in o:
                labels.append(8)
            elif 'semf_' in o:
                labels.append(9)
            elif 'dicomf_' in o:
                labels.append(10)
            elif 'LoG_' in o:
                labels.append(11)
            elif 'vf_' in o:
                labels.append(12)
            elif 'LBP_' in o:
                labels.append(13)
            elif 'phasef_' in o:
                labels.append(14)
            else:
                raise KeyError(o)

        # Strip pipeline and modality prefixes from the feature labels
        objects = [o.replace('CalcFeatures_', '') for o in objects]
        objects = [o.replace('featureconverter_', '') for o in objects]
        objects = [o.replace('PREDICT_', '') for o in objects]
        objects = [o.replace('PyRadiomics_', '') for o in objects]
        objects = [o.replace('Pyradiomics_', '') for o in objects]
        objects = [o.replace('predict_', '') for o in objects]
        objects = [o.replace('pyradiomics_', '') for o in objects]
        objects = [o.replace('original_', '') for o in objects]
        objects = [o.replace('train_', '') for o in objects]
        objects = [o.replace('test_', '') for o in objects]
        objects = [o.replace('1_0_', '') for o in objects]
        objects = [o.replace('hf_', '') for o in objects]
        objects = [o.replace('sf_', '') for o in objects]
        objects = [o.replace('of_', '') for o in objects]
        objects = [o.replace('GLCM_', '') for o in objects]
        objects = [o.replace('GLCMMS_', '') for o in objects]
        objects = [o.replace('GLRLM_', '') for o in objects]
        objects = [o.replace('GLSZM_', '') for o in objects]
        objects = [o.replace('GLDM_', '') for o in objects]
        objects = [o.replace('NGTDM_', '') for o in objects]
        objects = [o.replace('Gabor_', '') for o in objects]
        objects = [o.replace('semf_', '') for o in objects]
        objects = [o.replace('dicomf_', '') for o in objects]
        objects = [o.replace('LoG_', '') for o in objects]
        objects = [o.replace('vf_', '') for o in objects]
        objects = [o.replace('LBP_', '') for o in objects]
        objects = [o.replace('phasef_', '') for o in objects]
        objects = [o.replace('tf_', '') for o in objects]
        objects = [o.replace('_CT_0', '') for o in objects]
        objects = [o.replace('_MR_0', '') for o in objects]
        objects = [o.replace('CT_0', '') for o in objects]
        objects = [o.replace('MR_0', '') for o in objects]

        # Sort based on labels; also sort the objects to keep them aligned
        # with the p-values
        sort_indices = np.argsort(np.asarray(labels))
        p_values = [p_values[i] for i in sort_indices]
        labels = [labels[i] for i in sort_indices]
        objects = [objects[i] for i in sort_indices]

        # Make Manhattan plot
        manhattan_importance(values=p_values,
                             labels=labels,
                             output_png=output_png,
                             feature_labels=objects,
                             threshold_annotated=threshold,
                             mapping=mapping,
                             output_tex=output_tex)

    return savedict
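
# Illustrative usage sketch (hypothetical paths). plot_test='MWU' selects
# the Mann-Whitney U p-values for the Manhattan plot; with Bonferonni=True
# the significance threshold is divided by the number of features tested.
#
# savedict = StatisticalTestFeatures(
#     features=['MR=/data/feat_p1.hdf5,/data/feat_p2.hdf5'],
#     patientinfo='/data/pinfo.txt',
#     config='/data/config.ini',
#     output_csv='/data/output/stats.csv',
#     output_png='/data/output/manhattan.png')
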
def Decomposition(features, patientinfo, config, output, label_type=None,
                  verbose=True):
    """
    Perform decompositions of the feature space to two components.

    Usage is similar to StatisticalTestFeatures.

    Parameters
    ----------
    features: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equals
        sign, files are split by commas. We assume that the lists of
        files for each modality have the same length. Files on the same
        position on each list should belong to the same patient.

    patientinfo: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used
        for feature extraction. See the Github Wiki for the possible
        fields and their description.

    # TODO: outputs

    verbose: boolean, default True
        Whether to print final feature values and labels to the command
        line.

    """
    # Load variables from the config file
    config = config_io.load_config(config)

    # Create output folder if required
    if not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))

    if label_type is None:
        label_type = config['Labels']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # Detect NaNs; if present, feature imputation is required first
    if any(np.isnan(a) for a in np.asarray(feature_values).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(feature_values)
        feature_values = imputer.transform(feature_values)

    # -------------------------------------------------------------------
    # Perform decomposition
    print("Performing decompositions.")
    label_value = label_data['label']
    label_name = label_data['label_name']

    # Reduce to two components for plotting
    n_components = 2
    for i_class, i_name in zip(label_value, label_name):
        classlabels = i_class.ravel()

        class1 = [i for j, i in enumerate(feature_values)
                  if classlabels[j] == 1]
        class2 = [i for j, i in enumerate(feature_values)
                  if classlabels[j] == 0]

        f = plt.figure(figsize=(20, 15))

        # -------------------------------------------------------
        # Fit PCA
        pca = PCA(n_components=n_components)
        pca.fit(feature_values)
        explained_variance_ratio = np.sum(pca.explained_variance_ratio_)
        class1_pca = pca.transform(class1)
        class2_pca = pca.transform(class2)

        # Plot PCA
        ax = plt.subplot(2, 3, 1)
        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
        ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
        ax.set_title(f'PCA: {round(explained_variance_ratio, 3)} variance.')

        # -------------------------------------------------------
        # Fit Sparse PCA
        pca = SparsePCA(n_components=n_components)
        pca.fit(feature_values)
        class1_pca = pca.transform(class1)
        class2_pca = pca.transform(class2)

        # Plot Sparse PCA
        ax = plt.subplot(2, 3, 2)
        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
        ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
        ax.set_title('Sparse PCA.')

        # -------------------------------------------------------
        # Fit Kernel PCA
        fnum = 3
        for kernel in ['linear', 'poly', 'rbf']:
            pca = KernelPCA(n_components=n_components, kernel=kernel)
            pca.fit(feature_values)
            class1_pca = pca.transform(class1)
            class2_pca = pca.transform(class2)

            # Plot Kernel PCA
            ax = plt.subplot(2, 3, fnum)
            plt.subplots_adjust(hspace=0.3, wspace=0.2)
            ax.scatter(class1_pca[:, 0], class1_pca[:, 1], color='blue')
            ax.scatter(class2_pca[:, 0], class2_pca[:, 1], color='green')
            ax.set_title(f'Kernel PCA: {kernel}.')
            fnum += 1

        # -------------------------------------------------------
        # Fit t-SNE
        tSNE = TSNE(n_components=n_components)
        class_all = class1 + class2
        class_all_tsne = tSNE.fit_transform(class_all)
        class1_tSNE = class_all_tsne[0:len(class1)]
        class2_tSNE = class_all_tsne[len(class1):]

        # Plot t-SNE
        ax = plt.subplot(2, 3, 6)
        plt.subplots_adjust(hspace=0.3, wspace=0.2)
        ax.scatter(class1_tSNE[:, 0], class1_tSNE[:, 1], color='blue')
        ax.scatter(class2_tSNE[:, 0], class2_tSNE[:, 1], color='green')
        ax.set_title('t-SNE.')

        # -------------------------------------------------------
        # Maximize figure to get correct spacings
        # mng = plt.get_current_fig_manager()
        # mng.resize(*mng.window.maxsize())

        # High DPI to make sure we save the maximized image
        f.savefig(output, dpi=600)
        print(f"Decomposition saved as {output}!")
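
# Illustrative usage sketch (hypothetical paths). The output is a single
# figure with six panels: PCA, sparse PCA, three kernel PCA variants
# (linear, poly, rbf), and t-SNE, each reduced to two components, with the
# two classes colored blue and green.
#
# Decomposition(
#     features=['MR=/data/feat_p1.hdf5,/data/feat_p2.hdf5'],
#     patientinfo='/data/pinfo.txt',
#     config='/data/config.ini',
#     output='/data/output/decomposition.png')
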