def custom_config_overrides(self, config):
    """Validate user overrides against the default WORC config and store them.

    Every top-level key of ``config`` must be a key of the default WORC
    configuration, and every key of each sub-config must be a key of the
    matching part of that default configuration. When all keys are accepted,
    ``config`` is merged into ``self._custom_overrides`` with ``_deep_update``.

    Parameters
    ----------
    config: mapping of mappings
        Overrides, indexed first by config part and then by field name.

    Raises
    ------
    WORCKeyError
        If a part, or a field within a part, is not in the default config.
    """
    # A throwaway WORC object is created only to obtain the default config.
    reference = WORC().defaultconfig()
    known_parts = list(reference.keys())

    for part in config.keys():
        if part not in known_parts:
            raise WORCKeyError(f'Key "{part}" is not in the WORC config!.')

        # The fields inside this part must be known as well.
        requested_fields = config[part].keys()
        known_fields = list(reference[part].keys())
        for field in requested_fields:
            if field not in known_fields:
                raise WORCKeyError(f'Key "{field}" is not in part "{part}" of the WORC config!.')

    # Nothing was rejected: merge the overrides into the stored ones.
    _deep_update(self._custom_overrides, config)
def ComBat(features_train_in, labels_train, config, features_train_out,
           features_test_in=None, labels_test=None, features_test_out=None,
           VarianceThreshold=True, scaler=False, logarithmic=False):
    """
    Apply ComBat feature harmonization.

    Based on: https://github.com/Jfortin1/ComBatHarmonization

    The training features (and optionally the test features) are loaded,
    optionally imputed, scaled and variance-thresholded, harmonized together
    in a single ComBat run and written to one hdf5 file per patient. Patients
    whose batch value equals -1.0 are left out of the harmonization; their
    (non-harmonized) features are still written, under the Series name
    'Image features: ComBat skipped'.

    Parameters
    ----------
    features_train_in:
        Training feature files, handed to ``wio.load_features``.
    labels_train:
        Patient label information for the training set, handed to
        ``wio.load_features`` as ``patientinfo``.
    config:
        Configuration, handed to ``cio.load_config``. The fields
        ``excluded_features``, ``batch``, ``mod``, ``par``, ``language``,
        ``per_feature`` and, depending on the language, ``matlab`` or ``eb``
        of the ``ComBat`` part are read.
    features_train_out: list
        Output file per training patient, in the same order as the loaded
        training features. NOTE: entries of skipped patients are deleted
        from this list in place.
    features_test_in, labels_test, features_test_out: optional
        Same as the training counterparts, for the test set. The test set is
        only processed when ``features_test_in`` is truthy.
    VarianceThreshold: boolean, default True
        If truthy, apply ``selfeat_variance`` to the ComBat features.
    scaler: boolean, default False
        If truthy, fit a ``StandardScaler`` on the training ComBat features
        and apply it to both training and test features.
    logarithmic: boolean, default False
        If truthy, take the log10 of the features before ComBat and convert
        the values back (``10**x``) afterwards.

    Raises
    ------
    WORCKeyError
        If the configured ComBat language is neither 'matlab' nor 'python'.
    """
    # Load the config
    print('############################################################')
    print('# Initializing ComBat. #')
    print('############################################################\n')
    config = cio.load_config(config)
    excluded_features = config['ComBat']['excluded_features']

    # If mod, than also load moderating labels
    if config['ComBat']['mod'][0] == '[]':
        label_names = config['ComBat']['batch']
    else:
        label_names = config['ComBat']['batch'] + config['ComBat']['mod']

    # Load the features for both training and testing, match with batch and mod parameters
    label_data_train, image_features_train =\
        wio.load_features(features_train_in, patientinfo=labels_train,
                          label_type=label_names)

    feature_labels = image_features_train[0][1]
    image_features_train = [i[0] for i in image_features_train]
    label_data_train['patient_IDs'] = list(label_data_train['patient_IDs'])

    # Exclude features
    if excluded_features:
        print(f'\t Excluding features containing: {excluded_features}')
        # Determine indices of excluded features
        included_feature_indices = []
        excluded_feature_indices = []
        for fnum, i in enumerate(feature_labels):
            if not any(e in i for e in excluded_features):
                included_feature_indices.append(fnum)
            else:
                excluded_feature_indices.append(fnum)

        # Actually exclude the features
        image_features_train_combat = [
            np.asarray(i)[included_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_combat = np.asarray(
            feature_labels)[included_feature_indices].tolist()

        image_features_train_noncombat = [
            np.asarray(i)[excluded_feature_indices].tolist()
            for i in image_features_train
        ]
        feature_labels_noncombat = np.asarray(
            feature_labels)[excluded_feature_indices].tolist()
    else:
        image_features_train_combat = image_features_train
        feature_labels_combat = np.asarray(feature_labels).tolist()

        # Keep one (empty) entry per patient, as the non-ComBat features are
        # indexed per patient further on.
        image_features_train_noncombat = [[] for _ in image_features_train]
        feature_labels_noncombat = []

    # Detect NaNs, otherwise first feature imputation is required
    if any(np.isnan(a) for a in
           np.asarray(image_features_train_combat).flatten()):
        print('\t [WARNING] NaNs detected, applying median imputation')
        imputer = Imputer(missing_values=np.nan, strategy='median')
        imputer.fit(image_features_train_combat)
        image_features_train_combat = imputer.transform(
            image_features_train_combat)
    else:
        imputer = None

    # Apply a scaler to the features. The fitted object gets its own name so
    # the boolean ``scaler`` argument is not overwritten.
    fitted_scaler = None
    if scaler:
        print('\t Fitting scaler on dataset.')
        fitted_scaler = StandardScaler().fit(image_features_train_combat)
        image_features_train_combat = fitted_scaler.transform(
            image_features_train_combat)

    # Remove features with a constant value
    if VarianceThreshold:
        print('\t Applying variance threshold on dataset.')
        image_features_train_combat, feature_labels_combat, VarSel =\
            selfeat_variance(image_features_train_combat,
                             np.asarray([feature_labels_combat]))
        feature_labels_combat = feature_labels_combat[0].tolist()

    if features_test_in:
        label_data_test, image_features_test =\
            wio.load_features(features_test_in, patientinfo=labels_test,
                              label_type=label_names)

        image_features_test = [i[0] for i in image_features_test]
        label_data_test['patient_IDs'] = list(label_data_test['patient_IDs'])

        if excluded_features:
            image_features_test_combat = [
                np.asarray(i)[included_feature_indices].tolist()
                for i in image_features_test
            ]
            image_features_test_noncombat = [
                np.asarray(i)[excluded_feature_indices].tolist()
                for i in image_features_test
            ]
        else:
            image_features_test_combat = image_features_test
            # One (empty) entry per patient, see the training set above.
            image_features_test_noncombat = [[] for _ in image_features_test]

        # Apply imputation if required
        if imputer is not None:
            image_features_test_combat = imputer.transform(
                image_features_test_combat)

        # Apply a scaler to the features
        if fitted_scaler is not None:
            image_features_test_combat = fitted_scaler.transform(
                image_features_test_combat)

        # Remove features with a constant value
        if VarianceThreshold:
            image_features_test_combat = VarSel.transform(
                image_features_test_combat)

        # The features are still plain lists when no imputer, scaler or
        # variance threshold was applied, hence the np.asarray.
        all_features = np.asarray(image_features_train_combat).tolist() +\
            np.asarray(image_features_test_combat).tolist()

        all_labels = list()
        for i in range(label_data_train['label'].shape[0]):
            all_labels.append(label_data_train['label'][i, :, 0].tolist() +
                              label_data_test['label'][i, :, 0].tolist())

        all_labels = np.asarray(all_labels)
    else:
        all_features = np.asarray(image_features_train_combat).tolist()
        all_labels = label_data_train['label']

    # Convert data to a single array
    all_features_matrix = np.asarray(all_features)
    all_labels = np.squeeze(all_labels)

    # Apply logarithm if required
    if logarithmic:
        print('\t Taking log10 of features before applying ComBat.')
        all_features_matrix = np.log10(all_features_matrix)

    # Convert all_labels to dictionary
    if len(all_labels.shape) == 1:
        # No mod variables
        all_labels = {label_data_train['label_name'][0]: all_labels}
    else:
        all_labels = {
            k: v for k, v in zip(label_data_train['label_name'], all_labels)
        }

    # Split labels in batch and moderation labels
    bat = config['ComBat']['batch']
    mod = config['ComBat']['mod']
    print(f'\t Using batch variable {bat}, mod variables {mod}.')
    batch = [
        all_labels[l] for l in all_labels.keys()
        if l in config['ComBat']['batch']
    ]
    batch = batch[0]
    if config['ComBat']['mod'][0] == '[]':
        mod = None
    else:
        mod = [
            all_labels[l] for l in all_labels.keys()
            if l in config['ComBat']['mod']
        ]

    # Set parameters for output files
    parameters = {
        'batch': config['ComBat']['batch'],
        'mod': config['ComBat']['mod'],
        'par': config['ComBat']['par']
    }
    name = 'Image features: ComBat corrected'
    panda_labels = [
        'parameters', 'patient', 'feature_values', 'feature_labels'
    ]
    feature_labels = feature_labels_combat + feature_labels_noncombat

    # Convert all inputs to arrays with right shape
    all_features_matrix = np.transpose(all_features_matrix)
    if mod is not None:
        mod = np.transpose(np.asarray(mod))

    # Patients identified with batch -1.0 should be skipped
    skipname = 'Image features: ComBat skipped'

    # Number of training patients still present; training patients come
    # first in all combined arrays, so this is also the offset of the first
    # test patient.
    ntrain = len(image_features_train_combat)
    ndel = 0

    # ``batch`` is rebound to a shrunken copy inside the loop; the loop keeps
    # iterating over the original values.
    batch_before_skipping = batch
    for position, b in enumerate(batch_before_skipping):
        # Index of this patient in the arrays after the earlier deletions
        bnum = position - ndel
        if b == -1.0:
            # Features as they went in: undo the logarithm, as is done for
            # the harmonized features.
            combat_values = all_features_matrix[:, bnum]
            if logarithmic:
                combat_values = 10**combat_values

            if bnum < ntrain:
                # Training patient
                pid = label_data_train['patient_IDs'][bnum]
                out = features_train_out[bnum]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(combat_values) + list(
                    image_features_train_noncombat[bnum])

                # Delete patient for later processing
                del label_data_train['patient_IDs'][bnum]
                del image_features_train_noncombat[bnum]
                del features_train_out[bnum]
                image_features_train_combat = np.delete(
                    image_features_train_combat, bnum, 0)
                ntrain -= 1
            else:
                # Test patient: index relative to the remaining training
                # patients, not to the original number of them.
                tnum = bnum - ntrain
                pid = label_data_test['patient_IDs'][tnum]
                out = features_test_out[tnum]

                # Combine ComBat and non-ComBat features
                feature_values_temp = list(combat_values) + list(
                    image_features_test_noncombat[tnum])

                # Delete patient for later processing
                del label_data_test['patient_IDs'][tnum]
                del image_features_test_noncombat[tnum]
                del features_test_out[tnum]
                image_features_test_combat = np.delete(
                    image_features_test_combat, tnum, 0)

            # Delete some other variables for later processing
            all_features_matrix = np.delete(all_features_matrix, bnum, 1)
            if mod is not None:
                mod = np.delete(mod, bnum, 0)
            batch = np.delete(batch, bnum, 0)

            # Notify user
            print(
                f'[WARNING] Skipping patient {pid} as batch variable is -1.0.')

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=skipname)

            print(f'\t Saving image features to: {out}.')
            panda_data.to_hdf(out, key='image_features')

            ndel += 1

    # Run ComBat in Matlab
    if config['ComBat']['language'] == 'matlab':
        print('\t Executing ComBat through Matlab')
        data_harmonized = ComBatMatlab(
            dat=all_features_matrix,
            batch=batch,
            command=config['ComBat']['matlab'],
            mod=mod,
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])

    elif config['ComBat']['language'] == 'python':
        print('\t Executing ComBat through neuroComBat in Python')
        data_harmonized = ComBatPython(
            dat=all_features_matrix,
            batch=batch,
            mod=mod,
            eb=config['ComBat']['eb'],
            par=config['ComBat']['par'],
            per_feature=config['ComBat']['per_feature'])
    else:
        raise WORCKeyError(f"Language {config['ComBat']['language']} unknown.")

    # Convert values back if logarithm was used
    if logarithmic:
        data_harmonized = 10**data_harmonized

    # Convert again to train hdf5 files: the first columns are the
    # (remaining) training patients.
    feature_values_train_combat = [
        data_harmonized[:, i]
        for i in range(len(image_features_train_combat))
    ]
    for fnum, i_feat in enumerate(feature_values_train_combat):
        # Combine ComBat and non-ComBat features
        feature_values_temp = i_feat.tolist() +\
            image_features_train_noncombat[fnum]

        # Sort based on feature label
        feature_labels_temp, feature_values_temp =\
            zip(*sorted(zip(feature_labels, feature_values_temp)))

        # Convert to pandas Series and save as hdf5
        pid = label_data_train['patient_IDs'][fnum]
        panda_data = pd.Series(
            [parameters, pid, feature_values_temp, feature_labels_temp],
            index=panda_labels,
            name=name)

        print(f'Saving image features to: {features_train_out[fnum]}.')
        panda_data.to_hdf(features_train_out[fnum], key='image_features')

    # Repeat for testing if required
    if features_test_in:
        # The last columns are the (remaining) test patients.
        n_total = data_harmonized.shape[1]
        feature_values_test_combat = [
            data_harmonized[:, i]
            for i in range(n_total - len(image_features_test_combat), n_total)
        ]
        for fnum, i_feat in enumerate(feature_values_test_combat):
            # Combine ComBat and non-ComBat features
            feature_values_temp = i_feat.tolist() +\
                image_features_test_noncombat[fnum]

            # Sort based on feature label
            feature_labels_temp, feature_values_temp =\
                zip(*sorted(zip(feature_labels, feature_values_temp)))

            # Convert to pandas Series and save as hdf5
            pid = label_data_test['patient_IDs'][fnum]
            panda_data = pd.Series(
                [parameters, pid, feature_values_temp, feature_labels_temp],
                index=panda_labels,
                name=name)

            print(f'Saving image features to: {features_test_out[fnum]}.')
            panda_data.to_hdf(features_test_out[fnum], key='image_features')
def plot_ranked_scores(estimator, pinfo, label_type, scores='percentages',
                       images=None, segmentations=None, ensemble=50,
                       output_csv=None, output_zip=None, output_itk=None):
    '''
    Rank the patients according to their average score.

    The ranking itself is computed by plot_ranked_posteriors or
    plot_ranked_percentages, depending on ``scores``. When output_zip or
    output_itk is given, the ranked patients are additionally handed to
    plot_ranked_images.

    Parameters
    ----------
    estimator: filepath, mandatory
        Path pointing to the .hdf5 file which is the output of the
        trainclassifier function.

    pinfo: filepath, mandatory
        Path pointing to the .txt file which contains the patient label
        information.

    label_type: string or None, mandatory
        The name of the label predicted by the estimator. If None,
        the first key of the prediction file will be used.

    scores: string, default percentages
        Type of scoring to be used. Either 'posteriors' or 'percentages'.

    images: list, optional
        List containing the filepaths to the ITKImage image files of the
        patients. Defaults to an empty list.

    segmentations: list, optional
        List containing the filepaths to the ITKImage segmentation files of
        the patients. Defaults to an empty list.

    ensemble: integer or string, optional
        Method to be used for ensembling. Either an integer for a fixed size
        or 'Caruana' for the Caruana method, see the SearchCV function for
        more details.

    output_csv: filepath, optional
        If given, the scores will be written to this csv file.

    output_zip: filepath, optional
        If given, the images will be plotted and the pngs saved to this
        zip file.

    output_itk: filepath, optional
        WIP

    Raises
    ------
    WORCKeyError
        If ``scores`` is neither 'posteriors' nor 'percentages'.
    '''
    # None instead of a shared mutable list as default argument
    images = [] if images is None else images
    segmentations = [] if segmentations is None else segmentations

    if label_type is None:
        # Assume we want to have the first key. The prediction file is only
        # needed here, so it is not read when a label type is given.
        prediction = pd.read_hdf(estimator)
        label_type = prediction.keys()[0]

    if scores == 'posteriors':
        rank_patients = plot_ranked_posteriors
    elif scores == 'percentages':
        rank_patients = plot_ranked_percentages
    else:
        raise WORCKeyError(f'{scores!s} is not a valid scoring method!')

    ranked_scores, ranked_truths, ranked_PIDs =\
        rank_patients(estimator=estimator,
                      pinfo=pinfo,
                      label_type=label_type,
                      ensemble=ensemble,
                      output_csv=output_csv)

    if output_zip is not None or output_itk is not None:
        # Rerank the scores split per ground truth class: negative for 0,
        # positive for 1
        signed_scores = [
            -score if truth == 0 else score
            for truth, score in zip(ranked_truths, ranked_scores)
        ]

        ranking = np.argsort(signed_scores)
        ranked_scores = [signed_scores[r] for r in ranking]
        ranked_truths = [ranked_truths[r] for r in ranking]

        # Convert to lower to later on overcome matching errors
        ranked_PIDs = [ranked_PIDs[r].lower() for r in ranking]

        plot_ranked_images(pinfo=pinfo,
                           label_type=label_type,
                           images=images,
                           segmentations=segmentations,
                           ranked_truths=ranked_truths,
                           ranked_scores=ranked_scores,
                           ranked_PIDs=ranked_PIDs,
                           output_zip=output_zip,
                           output_itk=output_itk)
ranked_scores, ranked_truths, ranked_PIDs =\ plot_ranked_posteriors(estimator=estimator, pinfo=pinfo, label_type=label_type, ensemble=ensemble, output_csv=output_csv) elif scores == 'percentages': ranked_scores, ranked_truths, ranked_PIDs =\ plot_ranked_percentages(estimator=estimator, pinfo=pinfo, label_type=label_type, ensemble=ensemble, output_csv=output_csv) else: message = ('{} is not a valid scoring method!').format(str(scores)) raise WORCKeyError(message) if output_zip is not None: # Convert to lower to later on overcome matching errors ranked_PIDs = [i.lower() for i in ranked_PIDs] plot_ranked_images(pinfo=pinfo, label_type=label_type, images=images, segmentations=segmentations, ranked_truths=ranked_truths, ranked_scores=ranked_scores, ranked_PIDs=ranked_PIDs, output_zip=output_zip)
def convert_radiomix_features(input_file, output_folder):
    '''
    Convert .xlsx from RadiomiX to WORC compatible .hdf5 format

    For every row of the input file, a folder named after the patient ID and
    segmentation is created in the output folder, in which the features of
    that row are saved as 'features.hdf5'.

    Input:
    --------------
    input_file: .xlsx in which the feature are stored. Column 4 is read as
                patient ID, column 5 as segmentation name and the columns
                from 10 onwards as features (zero-based).
    output_folder: folder in which features are stored; created if it does
                   not exist yet.

    Raises:
    --------------
    WORCKeyError: if a feature label does not start with a known prefix.
    '''
    print('Converting .xlsx from RadiomiX to WORC compatible .hdf5 format...')

    # Create the output folder, including parents, if it does not exist yet
    os.makedirs(output_folder, exist_ok=True)

    # Read the input file and extract relevant fields
    f = pd.read_excel(input_file)
    pids = f.values[:, 4]
    segs = f.values[:, 5]
    features = f.values[:, 10:]

    # Feature label prefixes and the feature group they belong to. The order
    # matters: the first matching entry determines the group.
    prefix_groups = [
        (tuple(texture_features), 'tf_'),  # Texture feature
        (('IH_', 'Stats_'), 'hf_'),        # Histogram feature
        (('Shape_',), 'sf_'),              # Shape feature
        (('LoG_',), 'logf_'),              # LoG feature
        (('Fractal_',), 'fracf_'),         # Fractal feature
        (('LocInt_',), 'locf_'),           # Location feature
        (('RGRD_',), 'rgrdf_'),            # RGRD feature
        (('Wavelet_',), 'waveletf_'),      # Wavelet feature
    ]

    # Read the feature labels, and rename them according to the group they
    # belong to
    feature_labels = []
    for l in f.keys()[10:]:
        for prefixes, group in prefix_groups:
            if l.startswith(prefixes):
                feature_labels.append(group + 'RadiomiX_' + l)
                break
        else:
            raise WORCKeyError(f'Unknown feature {l}.')

    # Initiate labels for pandas file
    panda_labels = ['feature_values', 'feature_labels']

    # Symbols which are replaced by underscores in the folder names
    invalid_symbols = str.maketrans(' ()', '___')

    # For each patient, convert features
    for pid, seg, patient_features in zip(pids, segs, features):
        feature_values = patient_features.tolist()

        # Make an output folder per patient, remove invalid symbols. The
        # str() conversion is needed as Excel cells may be read as numbers.
        output = (str(pid) + str(seg)).translate(invalid_symbols)
        output = os.path.join(output_folder, output)

        # Check if output folder exists: otherwise create
        os.makedirs(output, exist_ok=True)

        output = os.path.join(output, 'features.hdf5')
        print(f'\t Writing {output}')

        # Convert to pandas Series and save as hdf5
        panda_data = pd.Series([feature_values, feature_labels],
                               index=panda_labels,
                               name='Image features')

        # Save the features to the .hdf5 file
        print('\t Saving image features')
        panda_data.to_hdf(output, key='image_features')