def apply_transform_parallelized(self, X):
    """
    :param X: the data to which the delegate should be applied in parallel
    """
    if self.nr_of_processes > 1:
        jobs_to_do = list()

        # distribute the data equally to all available cores
        number_of_items_to_process = PhotonDataHelper.find_n(X)
        number_of_items_for_each_core = int(np.ceil(number_of_items_to_process / self.nr_of_processes))
        logger.info("NeuroBranch " + self.name + ": Using " + str(self.nr_of_processes) +
                    " cores calculating " + str(number_of_items_for_each_core) + " items each")

        for start, stop in PhotonDataHelper.chunker(number_of_items_to_process,
                                                    number_of_items_for_each_core):
            X_batched, _, _ = PhotonDataHelper.split_data(X, None, {}, start, stop)

            # copy my pipeline
            new_pipe_mr = self.copy_me()
            new_pipe_copy = new_pipe_mr.base_element
            new_pipe_copy.cache_folder = self.base_element.cache_folder
            new_pipe_copy.skip_loading = True
            new_pipe_copy._parallel_use = True

            del_job = dask.delayed(NeuroBranch.parallel_application)(new_pipe_copy, X_batched)
            jobs_to_do.append(del_job)

        dask.compute(*jobs_to_do)
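
# A minimal, self-contained sketch of the batch-and-delay pattern used above,
# with a hypothetical worker standing in for NeuroBranch.parallel_application
# (illustration only, not part of the PHOTON API):
#
#     import dask
#     import numpy as np
#
#     def process_batch(batch):                      # hypothetical worker
#         return batch.mean(axis=0)
#
#     data = np.random.randn(100, 10)
#     nr_of_processes = 4
#     chunk = int(np.ceil(len(data) / nr_of_processes))
#     jobs = [dask.delayed(process_batch)(data[start:start + chunk])
#             for start in range(0, len(data), chunk)]
#     results = dask.compute(*jobs)                  # one result per batch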
def _prepare_data(self, X, y=None, **kwargs):
    logger.info("Preparing data for outer fold " +
                str(self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr) + "...")
    # prepare train and validation set data
    train_indices = self.cross_validaton_info.outer_folds[self.outer_fold_id].train_indices
    test_indices = self.cross_validaton_info.outer_folds[self.outer_fold_id].test_indices
    self._validation_X, self._validation_y, self._validation_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=train_indices)
    self._test_X, self._test_y, self._test_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=test_indices)

    # write numbers to database info object
    self.result_object.number_samples_validation = self._validation_y.shape[0]
    self.result_object.number_samples_test = self._test_y.shape[0]
    if self._pipe._estimator_type == "classifier":
        self.result_object.class_distribution_validation = FoldInfo.data_overview(self._validation_y)
        self.result_object.class_distribution_test = FoldInfo.data_overview(self._test_y)
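
# Conceptually, PhotonDataHelper.split_data with an indices argument selects the
# same rows from X, y and every kwargs entry. A simplified sketch of that idea
# (an assumption for illustration, not the actual implementation):
#
#     def split_by_indices(X, y, kwargs, indices):
#         X_sub = X[indices]
#         y_sub = y[indices] if y is not None else None
#         kwargs_sub = {k: np.asarray(v)[indices] for k, v in kwargs.items()}
#         return X_sub, y_sub, kwargs_sub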
def activate(self):
    if not PhotonRegistry.CUSTOM_ELEMENTS_FOLDER:
        raise ValueError(
            "To activate a custom elements folder, specify a folder when instantiating the registry "
            "module. Example: registry = PhotonRegistry('/MY/CUSTOM/ELEMENTS/FOLDER') "
            "In case you don't have any custom models, there is no need to activate the registry.")
    if not os.path.exists(PhotonRegistry.CUSTOM_ELEMENTS_FOLDER):
        raise FileNotFoundError(
            "Couldn't find custom elements folder: {}".format(PhotonRegistry.CUSTOM_ELEMENTS_FOLDER))
    if not os.path.isfile(os.path.join(PhotonRegistry.CUSTOM_ELEMENTS_FOLDER, 'CustomElements.json')):
        raise FileNotFoundError(
            "Couldn't find CustomElements.json. Did you register your element first?")

    # add folder to python path
    logger.info("Adding custom elements folder to system path...")
    sys.path.append(PhotonRegistry.CUSTOM_ELEMENTS_FOLDER)

    PhotonRegistry.ELEMENT_DICTIONARY.update(self.get_package_info(['CustomElements']))
    logger.info('Successfully activated custom elements!')
def balanced_accuracy(y_true, y_pred):  # = mean of sensitivity and specificity
    if len(np.unique(y_true)) == 2:
        return (specificity(y_true, y_pred) + sensitivity(y_true, y_pred)) / 2
    else:
        logger.info('Balanced accuracy (metric) is valid only for binary classification problems. '
                    'You have ' + str(len(np.unique(y_true))) + ' classes.')
        return np.nan
def _prepare_optimization(self):
    logger.info("Preparing Hyperparameter Optimization...")
    pipeline_elements = [e for name, e in self._pipe.elements]
    self.optimizer = self.optimization_info.get_optimizer()
    if isinstance(self.optimizer, PhotonMasterOptimizer):
        self.optimizer.prepare(pipeline_elements,
                               self.optimization_info.maximize_metric,
                               self.objective_function)
    else:
        self.optimizer.prepare(pipeline_elements, self.optimization_info.maximize_metric)

    # we've got some super strange pymodm problems here:
    # somehow some information from the previous outer fold lingers on and can be found within a
    # completely newly instantiated OuterFoldMDB object, hence we clear it
    self.result_object.tested_config_list = list()

    # copy constraint objects
    if self.optimization_info.performance_constraints is not None:
        if isinstance(self.optimization_info.performance_constraints, list):
            self.constraint_objects = [original.copy_me()
                                       for original in self.optimization_info.performance_constraints]
        else:
            self.constraint_objects = [self.optimization_info.performance_constraints.copy_me()]
    else:
        self.constraint_objects = None
def _check_duplicate(photon_name, class_str, content):
    """
    Helper function to check if the entry is either registered under a different name
    or if the name is already given to another class.

    Parameters:
    -----------
    * 'photon_name' [str]:
        The name of the element with which it is called within PHOTON
    * 'class_str' [str]:
        The namespace.Classname, where the class lives, from where it should be imported
    * 'content' [dict]:
        The content of the CustomElements.json

    Returns:
    --------
    Bool, False if there is no key with this name and the class is not already registered with another key
    """
    # check for duplicate name (dict key)
    if photon_name in content:
        logger.info('A PipelineElement named ' + photon_name + ' has already been registered.')
        return True

    # check for duplicate class_str
    if any(class_str in '.'.join([s[0], s[1]]) for s in content.values()):
        logger.info('The class named ' + class_str + ' has already been registered.')
        return True
    return False
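
# Expected layout of the CustomElements.json content, assuming each entry maps
# a PHOTON name to a (namespace.Classname, element_type) pair (names below are
# hypothetical):
#
#     content = {"MyEstimator": ("my_module.MyEstimator", "Estimator")}
#     _check_duplicate("MyEstimator", "other.Other", content)        # True: name taken
#     _check_duplicate("Fresh", "my_module.MyEstimator", content)    # True: class taken
#     _check_duplicate("Fresh", "other.Other", content)              # False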
def __init__(self, patch_size=25, random_state=42, nr_of_processes=3):
    logger.info("Nr of processes: " + str(nr_of_processes))
    super(PatchImages, self).__init__(output_img=True, nr_of_processes=nr_of_processes)
    # Todo: give cache folder to mother class

    self.patch_size = patch_size
    self.random_state = random_state
def sensitivity(y_true, y_pred):  # = true positive rate, hit rate, recall
    if len(np.unique(y_true)) == 2:
        from sklearn.metrics import confusion_matrix
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        return tp / (tp + fn)
    else:
        logger.info('Sensitivity (metric) is valid only for binary classification problems. You have ' +
                    str(len(np.unique(y_true))) + ' classes.')
        return np.nan
def specificity(y_true, y_pred):  # = true negative rate
    if len(np.unique(y_true)) == 2:
        from sklearn.metrics import confusion_matrix
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        return tn / (tn + fp)
    else:
        logger.info('Specificity (metric) is valid only for binary classification problems. You have ' +
                    str(len(np.unique(y_true))) + ' classes.')
        return np.nan
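
# Worked example for the three binary metrics above (toy labels):
#
#     y_true = [0, 0, 1, 1]
#     y_pred = [0, 1, 1, 1]
#     # confusion matrix: tn=1, fp=1, fn=0, tp=2
#     # sensitivity(y_true, y_pred)        -> 2 / (2 + 0) = 1.0
#     # specificity(y_true, y_pred)        -> 1 / (1 + 1) = 0.5
#     # balanced_accuracy(y_true, y_pred)  -> (0.5 + 1.0) / 2 = 0.75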
def list_rois(self, atlas: str):
    if atlas not in self.ATLAS_DICTIONARY.keys():
        logger.info("Atlas {} is not supported.".format(atlas))
        return
    atlas = self.get_atlas(atlas)
    roi_names = [roi.label for roi in atlas.roi_list]
    logger.info(str(roi_names))
    return roi_names
def register(self, photon_name: str, class_str: str, element_type: str):
    """
    Save element information to the JSON file.

    Parameters:
    -----------
    * 'photon_name' [str]:
        The string literal with which you want to access the class
    * 'class_str' [str]:
        The namespace of the class, like in the import statement
    * 'element_type' [str]:
        Can be 'Estimator' or 'Transformer'
    """
    # check if folder exists
    if not PhotonRegistry.CUSTOM_ELEMENTS_FOLDER:
        raise ValueError(
            "To register an element, specify a custom elements folder when instantiating the registry "
            "module. Example: registry = PhotonRegistry('/MY/CUSTOM/ELEMENTS/FOLDER')")

    if not element_type == "Estimator" and not element_type == "Transformer":
        raise ValueError("Variable element_type must be 'Estimator' or 'Transformer'")

    duplicate = self._check_duplicate(photon_name=photon_name,
                                      class_str=class_str,
                                      content=PhotonRegistry.CUSTOM_ELEMENTS)

    if not duplicate:
        python_file = os.path.join(PhotonRegistry.CUSTOM_ELEMENTS_FOLDER,
                                   class_str.split('.')[0] + '.py')
        if not os.path.isfile(python_file):
            raise FileNotFoundError(
                "Couldn't find python file {} in your custom elements folder. "
                "Please copy your file into this folder first!".format(python_file))

        # add new element
        PhotonRegistry.CUSTOM_ELEMENTS[photon_name] = class_str, element_type

        # write back to file
        self._write_to_json(PhotonRegistry.CUSTOM_ELEMENTS)
        logger.info('Adding PipelineElement ' + class_str +
                    ' to CustomElements.json as "' + photon_name + '".')

        # activate custom elements
        self.activate()

        # check custom element
        logger.info("Running tests on custom element...")
        return self._run_tests(photon_name, element_type)
    else:
        logger.error('Could not register element!')
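
# Hypothetical usage sketch (folder, module and class names are assumptions):
#
#     registry = PhotonRegistry('/home/me/custom_elements')
#     # expects /home/me/custom_elements/my_estimator.py to define MyEstimator
#     registry.register(photon_name='MyEstimator',
#                       class_str='my_estimator.MyEstimator',
#                       element_type='Estimator')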
def data_overview(y):
    if len(y.shape) > 1:
        # one-hot encoded
        logger.info("One-hot encoded data fold information not yet implemented")
        return {}
    else:
        unique, counts = np.unique(y, return_counts=True)
        unique = [str(u) for u in unique]
        counts = [int(c) for c in counts]
        return dict(zip(unique, counts))
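
# Example (toy targets):
#
#     data_overview(np.array([0, 0, 1, 1, 1]))   # -> {'0': 2, '1': 3}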
def _check_custom_folder(custom_folder):
    if not os.path.exists(custom_folder):
        logger.info('Creating folder {}'.format(custom_folder))
        os.makedirs(custom_folder)

    custom_file = os.path.join(custom_folder, 'CustomElements.json')
    if not os.path.isfile(custom_file):
        logger.info('Creating CustomElements.json')
        with open(custom_file, 'w') as f:
            json.dump('', f)

    return custom_folder
def write_convenience_files(self):
    if self.output_settings.save_output:
        logger.info("Writing summary file, plots and prediction csv to result folder...")
        self.write_summary()
        self.write_predictions_file()

    if self.output_settings.plots:
        self.plot_optimizer_history(self.results.hyperpipe_info.best_config_metric)
        self.eval_mean_time_components()
def save(self):
    if self.output_settings.mongodb_connect_url:
        connect(self.output_settings.mongodb_connect_url, alias='photon_core')
        logger.info('Write results to mongodb...')
        try:
            self.results.save()
        except DocumentTooLarge:
            logger.error('Could not save document into MongoDB: Document too large')

    if self.output_settings.save_output:
        logger.info("Writing results to project folder...")
        self.write_result_tree_to_file()
def _fit_dummy(self):
    if self.dummy_estimator is not None:
        logger.info("Running Dummy Estimator...")
        try:
            if isinstance(self._validation_X, np.ndarray):
                if len(self._validation_X.shape) > 2:
                    logger.info("Skipping dummy estimator because of too many dimensions")
                    self.result_object.dummy_results = None
                    return
            dummy_y = np.reshape(self._validation_y, (-1, 1))
            self.dummy_estimator.fit(dummy_y, self._validation_y)
            train_scores = InnerFoldManager.score(self.dummy_estimator,
                                                  self._validation_X,
                                                  self._validation_y,
                                                  metrics=self.optimization_info.metrics)

            # fill result tree with fold information
            inner_fold = MDBInnerFold()
            inner_fold.training = train_scores

            if self.cross_validaton_info.eval_final_performance:
                test_scores = InnerFoldManager.score(self.dummy_estimator,
                                                     self._test_X,
                                                     self._test_y,
                                                     metrics=self.optimization_info.metrics)
                print_metrics("DUMMY", test_scores.metrics)
                inner_fold.validation = test_scores

            self.result_object.dummy_results = inner_fold

            # performance constraints: DummyEstimator
            if self.constraint_objects is not None:
                dummy_constraint_objs = [opt for opt in self.constraint_objects
                                         if isinstance(opt, DummyPerformance)]
                if dummy_constraint_objs:
                    for dummy_constraint_obj in dummy_constraint_objs:
                        dummy_constraint_obj.set_dummy_performance(self.result_object.dummy_results)

            return inner_fold
        except Exception as e:
            logger.error(e)
            logger.info("Skipping dummy estimator because of error...")
            return None
    else:
        logger.info("Skipping dummy estimator...")
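
# The dummy estimator provides a naive baseline that real configurations are
# compared against. A comparable scikit-learn baseline would be (sketch,
# assuming a classification setting; X_train etc. are placeholders):
#
#     from sklearn.dummy import DummyClassifier
#     dummy = DummyClassifier(strategy="most_frequent")
#     dummy.fit(X_train, y_train)
#     baseline_accuracy = dummy.score(X_test, y_test)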
def load_many_from_db(mongo_connect_url: str, pipe_names: list):
    """
    Opens the PHOTON investigator and loads hyperpipe performance results from a MongoDB instance.

    Parameters
    ---------
    * 'mongo_connect_url' [str]:
        The MongoDB connection string including the database name
    * 'pipe_names' [list]:
        A list of the hyperpipe objects to load
    """
    FlaskManager().set_mongo_db_url(mongo_connect_url)
    for pipe in pipe_names:
        url = Investigator.__build_url("m", pipe)
        logger.info("Your url is: " + url)
    FlaskManager().run_app()
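
# Hypothetical usage (connection string and pipe names are placeholders):
#
#     Investigator.load_many_from_db("mongodb://localhost:27017/photon_results",
#                                    ["my_pipe_1", "my_pipe_2"])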
def save(self):
    if self.output_settings.mongodb_connect_url:
        connect(self.output_settings.mongodb_connect_url, alias="photon_core")
        logger.debug("Write results to mongodb...")
        try:
            self.results.save()
        except DocumentTooLarge:
            logger.error("Could not save document into MongoDB: Document too large")
            # try to reduce the amount of configs saved
            # if len(results_tree.outer_folds[0].tested_config_list) > 100:
            #     for outer_fold in results_tree.outer_folds:
            #         metrics_configs = [outer_fold.tested_configlist

    if self.output_settings.save_output:
        logger.info("Writing results to project folder...")
        self.write_result_tree_to_file()
def delete(self, photon_name):
    """
    Delete Element from JSON file

    Parameters:
    -----------
    * 'photon_name' [str]:
        The string literal encoding the class
    """
    if photon_name in PhotonRegistry.CUSTOM_ELEMENTS:
        del PhotonRegistry.CUSTOM_ELEMENTS[photon_name]
        self._write_to_json(PhotonRegistry.CUSTOM_ELEMENTS)
        logger.info('Removing the PipelineElement named "{0}" from CustomElements.json.'.format(photon_name))
    else:
        logger.info('Cannot remove "{0}" from CustomElements.json. '
                    'Element has not been registered before.'.format(photon_name))
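
# Usage sketch (folder and element name are placeholders):
#
#     registry = PhotonRegistry('/home/me/custom_elements')
#     registry.delete('MyEstimator')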
def plot_confusion_matrix(self, classes=None, normalize=False, title="Confusion matrix"):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    preds = ResultsHandler.get_test_predictions(self)
    cm = confusion_matrix(preds["y_true"], preds["y_pred"])
    np.set_printoptions(precision=2)
    if normalize:
        cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
        logger.info("Normalized confusion matrix")
    else:
        logger.info("Confusion matrix")
    logger.info(cm)

    plt.figure()
    cmap = plt.cm.Blues
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if classes is None:
        classes = ["class " + str(c + 1) for c in np.unique(preds["y_true"])]
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = ".2f" if normalize else "d"
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # plotlyFig = ResultsHandler.__plotlyfy(plt)
    plt.show()
def calc_config(self):
    '''
    Calculates the configurations and the subset-fragmentation to evaluate.
    Implemented as a generator.
    The returned tracking vars are for internal use and need to be passed to process_result.

    :return: next configuration to test, subset-frag to use, tracking-vars
    :rtype: dict, int, dict
    '''
    logger.info('**Fabolas: Starting initialization')
    for self._it in range(0, self._n_init):
        logger.debug('Fabolas: step ' + str(self._it) + ' (init)')
        start = time()
        result = self._init_models()
        tracking = {'overhead_time': time() - start}
        logger.debug('Fabolas: needed {t!s}s'.format(t=tracking['overhead_time']))
        yield self._create_param_dict(result, tracking)

    self._X = np.array(self._X)
    self._Y = np.array(self._Y)
    self._cost = np.array(self._cost)

    logger.info('**Fabolas: Starting optimization')
    for self._it in range(self._n_init, self._num_iterations):
        logger.debug('Fabolas: step ' + str(self._it) + ' (opt)')
        start = time()
        result = self._optimize_config()
        tracking = {'overhead_time': time() - start}
        logger.debug('Fabolas: needed {t!s}s'.format(t=tracking['overhead_time']))
        yield self._create_param_dict(result, tracking)

    logger.info('Fabolas: Final config')
    start = time()
    self._model_objective.train(self._X, self._Y, do_optimize=True)
    result = self.get_incumbent()
    tracking = {'overhead_time': time() - start}
    logger.debug('Fabolas: needed {t!s}s'.format(t=tracking['overhead_time']))
    yield self._create_param_dict(result, tracking)
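
# Consumption sketch for the generator above: every iteration yields the next
# configuration, the subset fragment to evaluate it on, and tracking variables
# that must be handed back. The evaluate() helper and the exact process_result
# signature are assumptions for illustration:
#
#     for config, subset_frag, tracking in optimizer.calc_config():
#         performance = evaluate(config, subset_frag)
#         optimizer.process_result(config, subset_frag, performance, tracking)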
def _run_tests(self, photon_name, element_type):
    # this is a sneaky hack to avoid circular imports of PipelineElement
    imported_module = importlib.import_module("photonai.base")
    desired_class = getattr(imported_module, "PipelineElement")
    custom_element = desired_class(photon_name)

    # check if the element has fit, transform, predict
    if not hasattr(custom_element.base_element, 'fit'):
        raise NotImplementedError("Custom element does not implement fit() method.")
    if element_type == 'Transformer' and not hasattr(custom_element.base_element, 'transform'):
        raise NotImplementedError("Custom element does not implement transform() method.")
    if element_type == 'Estimator' and not hasattr(custom_element.base_element, 'predict'):
        raise NotImplementedError("Custom element does not implement predict() method.")

    # check if estimator is regressor or classifier
    if element_type == 'Estimator':
        if hasattr(custom_element, '_estimator_type'):
            est_type = getattr(custom_element, '_estimator_type')
            if est_type == "regressor":
                X, y = load_boston(return_X_y=True)
            elif est_type == "classifier":
                X, y = load_breast_cancer(return_X_y=True)
            else:
                raise ValueError("Custom element does not specify whether it is a regressor or classifier. "
                                 "Is {}".format(est_type))
        else:
            raise NotImplementedError("Custom element does not specify whether it is a regressor or classifier. "
                                      "Consider inheritance from ClassifierMixin or RegressorMixin or set "
                                      "_estimator_type manually.")
    else:
        X, y = load_boston(return_X_y=True)

    # try and test functionality
    kwargs = {'covariates': np.random.randn(len(y))}

    try:
        # test fit
        returned_element = custom_element.base_element.fit(X, y, **kwargs)
    except Exception as e:
        logger.info("Not able to run tests on fit() method. Test data not compatible.")
        return e

    if not isinstance(returned_element, custom_element.base_element.__class__):
        raise NotImplementedError("fit() method does not return self.")

    try:
        # test transform or predict (if the base element does not implement a transform method,
        # predict is called by PipelineElement, so we only need to test transform() here)
        if custom_element.needs_y:
            if element_type == 'Estimator':
                raise NotImplementedError("Estimator should not need y.")
            Xt, yt, kwargst = custom_element.base_element.transform(X, y, **kwargs)
            if 'covariates' not in kwargst.keys():
                raise ValueError("Custom element does not correctly transform kwargs although needs_y is True. "
                                 "If you change the number of samples in transform(), make sure to transform kwargs "
                                 "respectively.")
            if not len(kwargst['covariates']) == len(X):
                raise ValueError("Custom element is not returning the correct number of samples!")
        elif custom_element.needs_covariates:
            if element_type == 'Estimator':
                yt, kwargst = custom_element.base_element.predict(X, **kwargs)
                if not len(yt) == len(y):
                    raise ValueError("Custom element is not returning the correct number of samples!")
            else:
                Xt, kwargst = custom_element.base_element.transform(X, **kwargs)
                if not len(Xt) == len(X) or not len(kwargst['covariates']) == len(X):
                    raise ValueError("Custom element is not returning the correct number of samples!")
        else:
            if element_type == 'Estimator':
                yt = custom_element.base_element.predict(X)
                if not len(yt) == len(y):
                    raise ValueError("Custom element is not returning the correct number of samples!")
            else:
                Xt = custom_element.base_element.transform(X)
                if not len(Xt) == len(X):
                    raise ValueError("Custom element is not returning the correct number of samples!")
    except ValueError as ve:
        if "too many values to unpack" in ve.args[0]:
            raise ValueError("Custom element does not return X, y and kwargs the way it should "
                             "according to needs_y and needs_covariates.")
        else:
            logger.info(ve.args[0])
            return ve
    except Exception as e:
        logger.info(e.args[0])
        logger.info("Not able to run tests on transform() or predict() method. "
                    "Test data probably not compatible.")
        return e

    logger.info('All tests on custom element passed.')
def fit(self, X, y=None, **kwargs):
    logger.photon_system_log('')
    logger.photon_system_log(
        '***************************************************************************************************************')
    logger.photon_system_log('Outer Cross Validation Fold {}'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr))
    logger.photon_system_log(
        '***************************************************************************************************************')

    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    self.best_metric_yet = None
    self.tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        self.fold_operation = FoldOperations.MEAN
    else:
        self.fold_operation = FoldOperations.RAW

    self.max_nr_of_configs = ''
    if hasattr(self.optimizer, 'n_configurations'):
        self.max_nr_of_configs = str(self.optimizer.n_configurations)

    if isinstance(self.optimizer, PhotonMasterOptimizer):
        self.optimizer.optimize()
    else:
        # do the optimizing
        for current_config in self.optimizer.ask:
            self.objective_function(current_config)

    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------')
    logger.info('Hyperparameter Optimization finished. Now finding best configuration...')
    logger.debug('Number of tested configurations: ' + str(self.tested_config_counter))

    # now go on with the best config found
    if self.tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, self.fold_operation)

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True
        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug('Fitting model with best configuration of outer fold...')
        optimum_pipe.fit(self._validation_X, self._validation_y, **self._validation_kwargs)

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well; move these items to the top
            logger.info('Calculating best model performance on test set...')

            logger.debug('...scoring test data')
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe, self._test_X, self._test_y,
                indices=self.cross_validaton_info.outer_folds[self.outer_fold_id].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs)

            logger.debug('...scoring training data')
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe, self._validation_X, self._validation_y,
                indices=self.cross_validaton_info.outer_folds[self.outer_fold_id].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs)

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb
            print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # we copy all mean values from the inner folds to the best config
                train_item_metrics = {}
                for m in metric_dict:
                    if m.operation == str(self.fold_operation):
                        train_item_metrics[m.metric_name] = m.value
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train)
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test)

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = best_config_performance_mdb

    logger.info('Computations in outer fold {} took {} minutes.'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
        (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds() / 60))
def objective_function(self, current_config):
    if current_config is None:
        return
    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------')
    self.tested_config_counter += 1

    if hasattr(self.optimizer, 'ask_for_pipe'):
        pipe_ctor = self.optimizer.ask_for_pipe()
    else:
        pipe_ctor = self.copy_pipe_fnc

    # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

    hp = InnerFoldManager(pipe_ctor, current_config,
                          self.optimization_info,
                          self.cross_validaton_info,
                          self.outer_fold_id,
                          self.constraint_objects,
                          cache_folder=self.cache_folder,
                          cache_updater=self.cache_updater)

    # test the configuration, cross-validated by the inner_cv object
    current_config_mdb = hp.fit(self._validation_X, self._validation_y, **self._validation_kwargs)
    current_config_mdb.config_nr = self.tested_config_counter

    if not current_config_mdb.config_failed:
        metric_train = MDBHelper.get_metric(current_config_mdb, self.fold_operation,
                                            self.optimization_info.best_config_metric)
        metric_test = MDBHelper.get_metric(current_config_mdb, self.fold_operation,
                                           self.optimization_info.best_config_metric, train=False)

        if metric_train is None or metric_test is None:
            raise Exception("Config did not fail, but did not get any metrics either....!!?")
        config_performance = (metric_train, metric_test)

        if self.best_metric_yet is None:
            self.best_metric_yet = config_performance
            self.current_best_config = current_config_mdb
        else:
            # check if we have the next superstar around that exceeds any old performance
            if self.optimization_info.maximize_metric:
                if metric_test > self.best_metric_yet[1]:
                    self.best_metric_yet = config_performance
                    self.current_best_config.save_memory()
                    self.current_best_config = current_config_mdb
                else:
                    current_config_mdb.save_memory()
            else:
                if metric_test < self.best_metric_yet[1]:
                    self.best_metric_yet = config_performance
                    self.current_best_config.save_memory()
                    self.current_best_config = current_config_mdb
                else:
                    current_config_mdb.save_memory()

        # print result for config
        computation_duration = (current_config_mdb.computation_end_time -
                                current_config_mdb.computation_start_time)
        logger.info('Computed configuration ' + str(self.tested_config_counter) + "/" +
                    self.max_nr_of_configs + " in " + str(computation_duration))
        logger.info("Performance: " + self.optimization_info.best_config_metric +
                    " - Train: " + "%.4f" % config_performance[0] +
                    ", Validation: " + "%.4f" % config_performance[1])
        logger.info("Best Performance So Far: " + self.optimization_info.best_config_metric +
                    " - Train: " + "%.4f" % self.best_metric_yet[0] +
                    ", Validation: " + "%.4f" % self.best_metric_yet[1])
    else:
        config_performance = (-1, -1)
        # print result for config
        logger.debug('...failed:')
        logger.error(current_config_mdb.config_error)

    # add config to result tree
    self.result_object.tested_config_list.append(current_config_mdb)

    # inform optimizer about performance
    logger.debug("Telling hyperparameter optimizer about recent performance.")
    if isinstance(self.optimizer, PhotonSlaveOptimizer):
        self.optimizer.tell(current_config, config_performance)
    logger.debug("Asking hyperparameter optimizer for new config.")

    # master optimizers minimize, so flip the sign of the validation metric if we maximize
    if self.optimization_info.maximize_metric:
        return 1 - config_performance[1]
    else:
        return config_performance[1]
def prepare(self, pipeline_elements, maximize_metric):
    self.pipeline_elements = pipeline_elements
    self.ask = self.next_config_generator()
    self.param_grid = create_global_config_grid(self.pipeline_elements)
    logger.info("Grid Search generated " + str(len(self.param_grid)) + " configurations")
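
# The global config grid is conceptually the Cartesian product of every
# element's hyperparameter values. A simplified sketch of that idea using
# itertools (hyperparameter names are made up):
#
#     import itertools
#     space = {'SVC__C': [0.1, 1, 10], 'SVC__kernel': ['rbf', 'linear']}
#     param_grid = [dict(zip(space.keys(), values))
#                   for values in itertools.product(*space.values())]
#     # -> 6 configurations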
def fit(self, X, y, **kwargs):
    """
    Iterates over cross-validation folds and trains the pipeline, then uses it for predictions.
    Calculates metrics per fold and averages them over folds.

    :param X: Training and test data
    :param y: Training and test targets
    :returns: configuration class for result tree that monitors training and test performance
    """
    # needed for testing Timeboxed Random Grid Search
    # time.sleep(35)

    config_item = MDBConfig()
    config_item.config_dict = self.params
    config_item.inner_folds = []
    config_item.metrics_test = []
    config_item.metrics_train = []
    config_item.computation_start_time = datetime.datetime.now()

    try:
        # do inner cv
        for idx, (inner_fold_id, inner_fold) in enumerate(
                self.cross_validation_infos.inner_folds[self.outer_fold_id].items()):

            train, test = inner_fold.train_indices, inner_fold.test_indices

            # split kwargs according to cross validation
            train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(X, y, kwargs, indices=train)
            test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(X, y, kwargs, indices=test)

            new_pipe = self.pipe()
            if self.cache_folder is not None and self.cache_updater is not None:
                self.cache_updater(new_pipe, self.cache_folder, inner_fold_id)

            if not config_item.human_readable_config:
                config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                    new_pipe, self.params)
                logger.clean_info(json.dumps(config_item.human_readable_config,
                                             indent=4, sort_keys=True))

            job_data = InnerFoldManager.InnerCVJob(
                pipe=new_pipe,
                config=dict(self.params),
                metrics=self.optimization_infos.metrics,
                callbacks=self.optimization_constraints,
                train_data=InnerFoldManager.JobData(train_X, train_y, train, kwargs_cv_train),
                test_data=InnerFoldManager.JobData(test_X, test_y, test, kwargs_cv_test))

            # only for non-parallel processing:
            # inform children in which inner fold we are
            # self.pipe.distribute_cv_info_to_hyperpipe_children(inner_fold_counter=fold_cnt)
            # self.mother_inner_fold_handle(fold_cnt)
            # --> write that output in InnerFoldManager!
            # logger.debug(config_item.human_readable_config)
            fold_nr = idx + 1
            logger.debug("calculating inner fold " + str(fold_nr) + "...")

            curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(job_data)
            logger.debug("Performance inner fold " + str(fold_nr))
            print_double_metrics(curr_train_fold.metrics, curr_test_fold.metrics,
                                 photon_system_log=False)

            durations = job_data.pipe.time_monitor

            self.update_config_item_with_inner_fold(
                config_item=config_item,
                fold_cnt=fold_nr,
                curr_train_fold=curr_train_fold,
                curr_test_fold=curr_test_fold,
                time_monitor=durations,
                feature_importances=new_pipe.feature_importances_)

            if isinstance(self.optimization_constraints, list):
                break_cv = 0
                for cf in self.optimization_constraints:
                    if not cf.shall_continue(config_item):
                        logger.info("Skipped further cross validation after fold " + str(fold_nr) +
                                    " due to performance constraints in " + cf.metric)
                        break_cv += 1
                        break
                if break_cv > 0:
                    break
            elif self.optimization_constraints is not None:
                if not self.optimization_constraints.shall_continue(config_item):
                    logger.info("Skipped further cross validation after fold " + str(fold_nr) +
                                " due to performance constraints in " +
                                self.optimization_constraints.metric)
                    break

        InnerFoldManager.process_fit_results(
            config_item,
            self.cross_validation_infos.calculate_metrics_across_folds,
            self.cross_validation_infos.calculate_metrics_per_fold,
            self.optimization_infos.metrics)

    except Exception as e:
        if self.raise_error:
            raise e
        logger.error(e)
        logger.error(traceback.format_exc())
        traceback.print_exc()
        if not isinstance(e, Warning):
            config_item.config_failed = True
        config_item.config_error = str(e)
        warnings.warn("One test iteration of pipeline failed with error")

    logger.debug("...done with configuration:")
    logger.debug(json.dumps(config_item.human_readable_config, indent=4, sort_keys=True))
    config_item.computation_end_time = datetime.datetime.now()
    return config_item
def fit(self, X, y, **kwargs):
    self.pipe = self.hyperpipe_constructor()

    # we need a mongodb to collect the results!
    if not self.pipe.output_settings.mongodb_connect_url:
        raise ValueError("MongoDB connection string must be given for permutation tests")

    # get all specified metrics
    best_config_metric = self.pipe.optimization.best_config_metric
    self.metrics = PermutationTest.manage_metrics(self.pipe.optimization.metrics,
                                                  self.pipe.elements[-1],
                                                  best_config_metric)

    # at first we do a reference optimization
    y_true = y

    # run with true labels
    connect(self.pipe.output_settings.mongodb_connect_url, alias="photon_core")

    # check if it already exists in the DB
    try:
        existing_reference = MDBHyperpipe.objects.raw({
            'permutation_id': self.mother_permutation_id,
            'computation_completed': True}).first()
        if not existing_reference.permutation_test:
            existing_reference.permutation_test = MDBPermutationResults(n_perms=self.n_perms)
            existing_reference.save()
        # check if all outer folds exist
        logger.info("Found hyperpipe computation with true targets, "
                    "skipping the optimization process with true targets")
    except DoesNotExist:
        # if we haven't computed the reference value yet, do it now:
        logger.info("Calculating reference values with true targets.")
        try:
            self.pipe.permutation_id = self.mother_permutation_id
            self.pipe.fit(X, y_true, **kwargs)
            self.pipe.results.computation_completed = True
            self.pipe.results.permutation_test = MDBPermutationResults(n_perms=self.n_perms)
            self.pipe.results.save()
            existing_reference = self.pipe.results
        except Exception as e:
            if self.pipe.results is not None:
                self.pipe.results.permutation_failed = str(e)
                self.pipe.results.save()

    # check for sanity
    if not self.__validate_usability(existing_reference):
        raise RuntimeError("Permutation test is not advised because results are not better than dummy. Aborting.")

    # find how many permutations have been computed already:
    # existing_permutations = MDBHyperpipe.objects.raw({'permutation_id': self.permutation_id,
    #                                                   'computation_completed': True}).count()
    existing_permutations = list(MDBHyperpipe.objects.raw({
        'permutation_id': self.permutation_id,
        'computation_completed': True}).only('name'))
    existing_permutations = [int(perm_run.name.split('_')[-1])
                             for perm_run in existing_permutations]

    # compute only the permutation runs that are still missing, e.g. because previous runs broke
    if len(existing_permutations) > 0:
        perms_todo = set(np.arange(self.n_perms)) - set(existing_permutations)
    else:
        perms_todo = np.arange(self.n_perms)

    logger.info(str(len(perms_todo)) + " permutation runs to do")

    if len(perms_todo) > 0:
        # create permutation labels
        np.random.seed(self.random_state)
        self.permutations = [np.random.permutation(y_true) for _ in range(self.n_perms)]

        # run parallel pool
        job_list = list()
        if self.n_processes > 1:
            try:
                my_client = Client(threads_per_worker=1,
                                   n_workers=self.n_processes,
                                   processes=False)
                for perm_run in perms_todo:
                    del_job = dask.delayed(PermutationTest.run_parallelized_permutation)(
                        self.hyperpipe_constructor, X, perm_run,
                        self.permutations[perm_run], self.permutation_id,
                        self.verbosity, **kwargs)
                    job_list.append(del_job)
                dask.compute(*job_list)
            finally:
                my_client.close()
        else:
            for perm_run in perms_todo:
                PermutationTest.run_parallelized_permutation(
                    self.hyperpipe_constructor, X, perm_run,
                    self.permutations[perm_run], self.permutation_id,
                    self.verbosity, **kwargs)

    perm_result = self._calculate_results(self.permutation_id)

    performance_df = pd.DataFrame(dict([(name, [i])
                                        for name, i in perm_result.p_values.items()]))
    performance_df.to_csv(os.path.join(existing_reference.output_folder,
                                       'permutation_test_results.csv'))
    return self
def collect_results(self, result):
    # This is called whenever foo_pool(i) returns a result.
    # result_list is modified only by the main process, not the pool workers.
    logger.info("Finished permutation run " + str(result))
def start_flask(storage, pipe_name):
    url = Investigator.__build_url(storage, pipe_name)
    logger.info("Your url is: " + url)
    Investigator.__delayed_browser(url)
    FlaskManager().run_app()
def _calculate_results(permutation_id, save_to_db=True,
                       mongodb_path="mongodb://trap-umbriel:27017/photon_results"):
    logger.info("Calculating permutation test results")
    try:
        mother_permutation = PermutationTest.find_reference(mongodb_path, permutation_id)
        # mother_permutation = MDBHyperpipe.objects.raw(
        #     {'permutation_id': PermutationTest.get_mother_permutation_id(permutation_id),
        #      'computation_completed': True}).first()
    except DoesNotExist:
        return None
    else:
        all_permutations = list(MDBHyperpipe.objects.raw({
            'permutation_id': permutation_id,
            'computation_completed': True}).project({'metrics_test': 1}))
        # all_permutations = MDBHyperpipe.objects.raw({'permutation_id': permutation_id,
        #                                              'computation_completed': True}).only('metrics_test')
        number_of_permutations = len(all_permutations)

        if number_of_permutations == 0:
            number_of_permutations = 1

        true_performances = dict([(m.metric_name, m.value)
                                  for m in mother_permutation.metrics_test
                                  if m.operation == "FoldOperations.MEAN"])

        perm_performances = dict()
        metric_list = list(set([m.metric_name for m in mother_permutation.metrics_test]))
        metrics = PermutationTest.manage_metrics(
            metric_list, None, mother_permutation.hyperpipe_info.best_config_metric)

        for _, metric in metrics.items():
            perm_performances[metric["name"]] = [
                m.value for i in all_permutations for m in i.metrics_test
                if m.metric_name == metric["name"] and m.operation == "FoldOperations.MEAN"]

        # calculate p-value
        p = PermutationTest.calculate_p(true_performance=true_performances,
                                        perm_performances=perm_performances,
                                        metrics=metrics,
                                        n_perms=number_of_permutations)
        p_text = dict()
        for _, metric in metrics.items():
            if p[metric['name']] == 0:
                p_text[metric['name']] = "p < {}".format(str(1 / number_of_permutations))
            else:
                p_text[metric['name']] = "p = {}".format(p[metric['name']])

        # print results
        logger.clean_info("""
Done with permutations...

Results Permutation test
===============================================
""")
        for _, metric in metrics.items():
            logger.clean_info("""
Metric: {}
True Performance: {}
p Value: {}

""".format(metric['name'], true_performances[metric['name']], p_text[metric['name']]))

        if save_to_db:
            # write results to results object
            if mother_permutation.permutation_test is None:
                perm_results = MDBPermutationResults(n_perms=number_of_permutations)
            else:
                perm_results = mother_permutation.permutation_test
            perm_results.n_perms_done = number_of_permutations
            results_all_metrics = list()
            for _, metric in metrics.items():
                perm_metrics = MDBPermutationMetrics(
                    metric_name=metric['name'],
                    p_value=p[metric['name']],
                    metric_value=true_performances[metric['name']])
                perm_metrics.values_permutations = perm_performances[metric['name']]
                results_all_metrics.append(perm_metrics)
            perm_results.metrics = results_all_metrics
            mother_permutation.permutation_test = perm_results
            mother_permutation.save()

        if mother_permutation.permutation_test is not None:
            n_perms = mother_permutation.permutation_test.n_perms
        else:
            # we guess?
            n_perms = 1000

        result = PermutationTest.PermutationResult(true_performances, perm_performances,
                                                   p, number_of_permutations, n_perms)
        return result
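
# The permutation p-value is conceptually the fraction of permutation runs that
# perform at least as well as the model trained on the true targets. A
# simplified sketch of that idea (the actual logic lives in
# PermutationTest.calculate_p):
#
#     def p_value(true_score, perm_scores, greater_is_better=True):
#         perm_scores = np.asarray(perm_scores)
#         if greater_is_better:
#             return np.sum(perm_scores >= true_score) / len(perm_scores)
#         return np.sum(perm_scores <= true_score) / len(perm_scores)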