Example #1
0
    def apply_transform_parallelized(self, X):
        """
        Apply the delegate to X in parallel batches.

        Only acts when more than one process is configured: X is cut into
        batches of (at most) ceil(n_items / nr_of_processes) items, every
        batch gets its own copy of this element's base element, and all
        batches are submitted as dask delayed jobs that are computed in a
        single ``dask.compute`` call. Nothing is returned.

        :param X: the data to which the delegate should be applied in parallel
        """
        if not self.nr_of_processes > 1:
            return

        # distribute the data equally to all available cores
        n_items = PhotonDataHelper.find_n(X)
        batch_size = int(np.ceil(n_items / self.nr_of_processes))
        logger.info("NeuroBranch " + self.name + ": Using " +
                    str(self.nr_of_processes) + " cores calculating " +
                    str(batch_size) + " items each")

        delayed_jobs = []
        for first, last in PhotonDataHelper.chunker(n_items, batch_size):
            batch, _, _ = PhotonDataHelper.split_data(X, None, {}, first, last)

            # every job works on its own copy of the pipeline
            element_copy = self.copy_me()
            worker = element_copy.base_element
            worker.cache_folder = self.base_element.cache_folder
            worker.skip_loading = True
            worker._parallel_use = True

            delayed_jobs.append(
                dask.delayed(NeuroBranch.parallel_application)(worker, batch))

        dask.compute(*delayed_jobs)
Example #2
0
    def _prepare_data(self, X, y=None, **kwargs):
        """
        Split the data for the current outer fold and record sample statistics.

        The fold's train indices fill the ``_validation_*`` attributes and
        its test indices fill the ``_test_*`` attributes. Sample counts are
        written to the result object; for classifiers the class
        distributions of both parts are stored as well.
        """
        outer_fold = self.cross_validaton_info.outer_folds[self.outer_fold_id]
        logger.info(
            "Preparing data for outer fold " + str(outer_fold.fold_nr) + "..."
        )

        # Prepare Train and validation set data
        train_idx = outer_fold.train_indices
        test_idx = outer_fold.test_indices
        validation_part = PhotonDataHelper.split_data(
            X, y, kwargs, indices=train_idx)
        self._validation_X, self._validation_y, self._validation_kwargs = validation_part
        test_part = PhotonDataHelper.split_data(
            X, y, kwargs, indices=test_idx)
        self._test_X, self._test_y, self._test_kwargs = test_part

        # write numbers to database info object
        self.result_object.number_samples_validation = self._validation_y.shape[0]
        self.result_object.number_samples_test = self._test_y.shape[0]
        if self._pipe._estimator_type == "classifier":
            validation_distribution = FoldInfo.data_overview(self._validation_y)
            self.result_object.class_distribution_validation = validation_distribution
            test_distribution = FoldInfo.data_overview(self._test_y)
            self.result_object.class_distribution_test = test_distribution
Example #3
0
    def activate(self):
        """
        Make the registered custom elements available to PHOTON.

        Checks that a custom elements folder was configured, that it exists
        and that it contains a CustomElements.json. The folder is then put
        on ``sys.path`` (only if it is not on it already) and the entries
        returned by ``get_package_info(['CustomElements'])`` are merged into
        ``PhotonRegistry.ELEMENT_DICTIONARY``.

        Raises:
        -------
        * ValueError: no custom elements folder has been configured.
        * FileNotFoundError: the folder or its CustomElements.json is missing.
        """
        custom_folder = PhotonRegistry.CUSTOM_ELEMENTS_FOLDER
        if not custom_folder:
            raise ValueError(
                "To activate a custom elements folder, specify a folder when instantiating the registry "
                "module. Example: registry = PhotonRegistry('/MY/CUSTOM/ELEMENTS/FOLDER) "
                "In case you don't have any custom models, there is no need to activate the registry."
            )
        if not os.path.exists(custom_folder):
            raise FileNotFoundError(
                "Couldn't find custom elements folder: {}".format(custom_folder))
        if not os.path.isfile(os.path.join(custom_folder,
                                           'CustomElements.json')):
            raise FileNotFoundError(
                "Couldn't find CustomElements.json. Did you register your element first?"
            )

        # add folder to python path -- but only once: activate() may be
        # called repeatedly and duplicate entries would just pile up
        if custom_folder not in sys.path:
            logger.info("Adding custom elements folder to system path...")
            sys.path.append(custom_folder)

        PhotonRegistry.ELEMENT_DICTIONARY.update(
            self.get_package_info(['CustomElements']))
        logger.info('Successfully activated custom elements!')
Example #4
0
def balanced_accuracy(y_true, y_pred):  # = mean of sensitivity and specificity
    """
    Return the balanced accuracy for a binary classification problem.

    The value is the arithmetic mean of specificity and sensitivity. If
    y_true does not contain exactly two distinct classes, a message is
    logged and np.nan is returned.
    """
    n_classes = len(np.unique(y_true))
    if n_classes == 2:
        return (specificity(y_true, y_pred) + sensitivity(y_true, y_pred)) / 2

    # the message used to name "Specificity", a leftover from copy & paste
    logger.info('Balanced accuracy (metric) is valid only for binary classification problems. You have ' +
                str(n_classes) + ' classes.')
    return np.nan
Example #5
0
    def _prepare_optimization(self):
        """
        Set up the optimizer and reset per-fold state before optimizing.

        Fetches the optimizer from the optimization info and calls its
        ``prepare`` with the pipeline elements and the maximize flag; a
        PhotonMasterOptimizer additionally receives the objective function.
        Afterwards the tested config list is cleared and the performance
        constraints (if any) are copied into ``self.constraint_objects``.
        """
        logger.info("Preparing Hyperparameter Optimization...")
        pipeline_elements = [element for _, element in self._pipe.elements]

        self.optimizer = self.optimization_info.get_optimizer()
        prepare_args = [pipeline_elements,
                        self.optimization_info.maximize_metric]
        if isinstance(self.optimizer, PhotonMasterOptimizer):
            prepare_args.append(self.objective_function)
        self.optimizer.prepare(*prepare_args)

        # we've got some super strange pymodm problems here
        # somehow some information from the previous outer fold lingers on and can be found within a completely new
        # instantiated OuterFoldMDB object
        # hence, clearing it
        self.result_object.tested_config_list = list()

        # copy constraint objects.
        constraints = self.optimization_info.performance_constraints
        if constraints is None:
            self.constraint_objects = None
        elif isinstance(constraints, list):
            self.constraint_objects = [
                constraint.copy_me() for constraint in constraints
            ]
        else:
            # a single constraint object is wrapped into a list
            self.constraint_objects = [constraints.copy_me()]
Example #6
0
    def _check_duplicate(photon_name, class_str, content):
        """
        Helper function to check if the entry is either registered by a different name or if the name is already given
        to another class

         Parameters:
        -----------
        * 'content':
          The content of the CustomElements.json
        * 'class_str' [str]:
          The namespace.Classname, where the class lives, from where it should be imported.
        * 'photon_name':
          The name of the element with which it is called within PHOTON
        Returns:
        --------
        Bool, False if there is no key with this name and the class is not already registered with another key
        """

        # check for duplicate name (dict key)
        if photon_name in content:
            logger.info('A PipelineElement named ' + photon_name +
                        ' has already been registered.')
            return True

        # check for duplicate class_str
        if any(class_str in '.'.join([s[0], s[1]]) for s in content.values()):
            logger.info('The Class named ' + class_str +
                        ' has already been registered.')
            return True
        return False
    def __init__(self, patch_size=25, random_state=42, nr_of_processes=3):
        """
        Log the process count, initialise the parent class and store settings.

        * 'patch_size': stored unchanged as ``self.patch_size``
        * 'random_state': stored unchanged as ``self.random_state``
        * 'nr_of_processes': forwarded to the parent constructor, which is
          always called with ``output_img=True``
        """
        process_info = "Nr or processes: " + str(nr_of_processes)
        logger.info(process_info)
        super(PatchImages, self).__init__(nr_of_processes=nr_of_processes,
                                          output_img=True)
        # Todo: give cache folder to mother class

        self.random_state = random_state
        self.patch_size = patch_size
Example #8
0
def sensitivity(y_true, y_pred):  # = true positive rate, hit rate, recall
    """
    Return tp / (tp + fn) taken from the raveled confusion matrix.

    Only defined here for binary problems: if y_true does not hold exactly
    two distinct values, a message is logged and np.nan is returned.
    """
    n_classes = len(np.unique(y_true))
    if n_classes != 2:
        logger.info('Sensitivity (metric) is valid only for binary classification problems. You have ' +
                    str(n_classes) + ' classes.')
        return np.nan

    from sklearn.metrics import confusion_matrix
    _, _, false_negatives, true_positives = confusion_matrix(y_true, y_pred).ravel()
    return true_positives / (true_positives + false_negatives)
Example #9
0
def specificity(y_true, y_pred):  # = true negative rate
    """
    Return tn / (tn + fp) taken from the raveled confusion matrix.

    Only defined here for binary problems: if y_true does not hold exactly
    two distinct values, a message is logged and np.nan is returned.
    """
    n_classes = len(np.unique(y_true))
    if n_classes != 2:
        logger.info('Specificity (metric) is valid only for binary classification problems. You have ' +
                    str(n_classes) + ' classes.')
        return np.nan

    from sklearn.metrics import confusion_matrix
    true_negatives, false_positives, _, _ = confusion_matrix(y_true, y_pred).ravel()
    return true_negatives / (true_negatives + false_positives)
Example #10
0
    def list_rois(self, atlas: str):
        """
        Log and return the ROI labels of the given atlas.

        If the atlas name is not a key of ``ATLAS_DICTIONARY`` a message is
        logged and None is returned.
        """
        if atlas in self.ATLAS_DICTIONARY.keys():
            atlas_object = self.get_atlas(atlas)
            roi_names = [roi.label for roi in atlas_object.roi_list]
            logger.info(str(roi_names))
            return roi_names

        logger.info("Atlas {} is not supported.".format(atlas))
        return None
Example #11
0
    def register(self, photon_name: str, class_str: str, element_type: str):
        """
        Save element information to the JSON file, activate it and test it

        Parameters:
        -----------
        * 'photon_name' [str]:
          The string literal with which you want to access the class
        * 'class_str' [str]:
          The namespace of the class, like in the import statement
        * 'element_type' [str]:
          Can be 'Estimator' or 'Transformer'

        Returns:
        --------
        The result of _run_tests() for a newly registered element; None if
        the element is a duplicate and therefore was not registered.
        """

        # check if folder exists
        custom_folder = PhotonRegistry.CUSTOM_ELEMENTS_FOLDER
        if not custom_folder:
            raise ValueError(
                "To register an element, specify a custom elements folder when instantiating the registry "
                "module. Example: registry = PhotonRegistry('/MY/CUSTOM/ELEMENTS/FOLDER)"
            )

        if element_type not in ("Estimator", "Transformer"):
            raise ValueError(
                "Variable element_type must be 'Estimator' or 'Transformer'")

        is_duplicate = self._check_duplicate(
            photon_name=photon_name,
            class_str=class_str,
            content=PhotonRegistry.CUSTOM_ELEMENTS)
        if is_duplicate:
            logger.error('Could not register element!')
            return None

        # the first part of class_str names the module file in the folder
        module_file = class_str.split('.')[0] + '.py'
        python_file = os.path.join(custom_folder, module_file)
        if not os.path.isfile(python_file):
            raise FileNotFoundError(
                "Couldn't find python file {} in your custom elements folder. "
                "Please copy your file into this folder first!".format(
                    python_file))

        # add new element
        PhotonRegistry.CUSTOM_ELEMENTS[photon_name] = (class_str, element_type)

        # write back to file
        self._write_to_json(PhotonRegistry.CUSTOM_ELEMENTS)
        logger.info('Adding PipelineElement ' + class_str +
                    ' to CustomElements.json as "' + photon_name + '".')

        # activate custom elements
        self.activate()

        # check custom element
        logger.info("Running tests on custom element...")
        return self._run_tests(photon_name, element_type)
Example #12
0
 def data_overview(y):
     """
     Count how often each distinct value occurs in y.

     Returns a dict mapping str(value) to its count as a plain int. For
     input with more than one dimension (treated as one hot encoded) a
     message is logged and an empty dict is returned.
     """
     if len(y.shape) <= 1:
         labels, occurrences = np.unique(y, return_counts=True)
         return {
             str(label): int(count)
             for label, count in zip(labels, occurrences)
         }

     # one hot encoded
     logger.info(
         "One Hot Encoded data fold information not yet implemented")
     return {}
Example #13
0
    def _check_custom_folder(custom_folder):
        if not os.path.exists(custom_folder):
            logger.info('Creating folder {}'.format(custom_folder))
            os.makedirs(custom_folder)

        custom_file = os.path.join(custom_folder, 'CustomElements.json')
        if not os.path.isfile(custom_file):
            logger.info('Creating CustomElements.json')
            with open(custom_file, 'w') as f:
                json.dump('', f)

        return custom_folder
Example #14
0
    def write_convenience_files(self):
        """
        Write the optional summary outputs according to the output settings.

        With ``save_output`` set, the summary and the predictions file are
        written. With ``plots`` set, the optimizer history is plotted for
        the best config metric and the mean time components are evaluated.
        """
        if self.output_settings.save_output:
            logger.info(
                "Writing summary file, plots and prediction csv to result folder ..."
            )
            self.write_summary()
            self.write_predictions_file()

        if not self.output_settings.plots:
            return

        self.plot_optimizer_history(
            self.results.hyperpipe_info.best_config_metric)
        self.eval_mean_time_components()
Example #15
0
    def save(self):
        """
        Persist the results to MongoDB and/or the project folder.

        If a MongoDB url is configured, a connection with alias
        'photon_core' is opened and the results are saved; a
        DocumentTooLarge error is logged and otherwise ignored. If
        ``save_output`` is set, the result tree is written to file.
        """
        mongo_url = self.output_settings.mongodb_connect_url
        if mongo_url:
            connect(mongo_url, alias='photon_core')
            logger.info('Write results to mongodb...')
            try:
                self.results.save()
            except DocumentTooLarge:
                logger.error(
                    'Could not save document into MongoDB: Document too large')

        if not self.output_settings.save_output:
            return
        logger.info("Writing results to project folder...")
        self.write_result_tree_to_file()
Example #16
0
    def _fit_dummy(self):
        """
        Fit and score the dummy estimator for this outer fold.

        The dummy estimator is fitted with the validation targets (reshaped
        to one column) as its input and then scored on the validation data
        and, if ``eval_final_performance`` is set, on the test data. The
        scores are stored in ``result_object.dummy_results`` and handed to
        every DummyPerformance constraint.

        Returns the MDBInnerFold with the dummy scores, or None if no dummy
        estimator is set, the validation array has more than two
        dimensions, or any error occurred (errors are logged, not raised).
        """
        if self.dummy_estimator is None:
            logger.info("Skipping dummy ..")
            return None

        logger.info("Running Dummy Estimator...")
        try:
            too_many_dims = (isinstance(self._validation_X, np.ndarray)
                             and len(self._validation_X.shape) > 2)
            if too_many_dims:
                logger.info(
                    "Skipping dummy estimator because of too many dimensions"
                )
                self.result_object.dummy_results = None
                return None

            targets_as_column = np.reshape(self._validation_y, (-1, 1))
            self.dummy_estimator.fit(targets_as_column, self._validation_y)
            train_scores = InnerFoldManager.score(
                self.dummy_estimator,
                self._validation_X,
                self._validation_y,
                metrics=self.optimization_info.metrics,
            )

            # fill result tree with fold information
            dummy_fold = MDBInnerFold()
            dummy_fold.training = train_scores

            if self.cross_validaton_info.eval_final_performance:
                test_scores = InnerFoldManager.score(
                    self.dummy_estimator,
                    self._test_X,
                    self._test_y,
                    metrics=self.optimization_info.metrics,
                )
                print_metrics("DUMMY", test_scores.metrics)
                dummy_fold.validation = test_scores

            self.result_object.dummy_results = dummy_fold

            # performaceConstraints: DummyEstimator
            if self.constraint_objects is not None:
                for constraint in self.constraint_objects:
                    if isinstance(constraint, DummyPerformance):
                        constraint.set_dummy_performance(
                            self.result_object.dummy_results
                        )

            return dummy_fold
        except Exception as error:
            logger.error(error)
            logger.info("Skipping dummy because of error..")
            return None
Example #17
0
    def load_many_from_db(mongo_connect_url: str, pipe_names: list):
        """
        Opens the PHOTON investigator for several hyperpipes whose
        performance results are stored in a MongoDB instance.
        The url of every hyperpipe is logged before the app is started.

        Parameters
        ---------
        * 'mongo_connect_url' [str]:
            The MongoDB connection string including the database name
        * 'pipe_names' [list]:
            A list of the hyperpipe objects to load
        """

        FlaskManager().set_mongo_db_url(mongo_connect_url)
        for pipe_name in pipe_names:
            # "m" is passed through to __build_url as the storage marker
            pipe_url = Investigator.__build_url("m", pipe_name)
            logger.info("Your url is: " + pipe_url)
        FlaskManager().run_app()
Example #18
0
    def save(self):
        """
        Persist the results to MongoDB and/or the project folder.

        If a MongoDB url is configured, a connection with alias
        "photon_core" is opened and the results are saved; a
        DocumentTooLarge error is logged and otherwise ignored. If
        ``save_output`` is set, the result tree is written to file.
        """
        mongo_url = self.output_settings.mongodb_connect_url
        if mongo_url:
            connect(mongo_url, alias="photon_core")
            logger.debug("Write results to mongodb...")
            try:
                self.results.save()
            except DocumentTooLarge:
                logger.error(
                    "Could not save document into MongoDB: Document too large")
                # Todo: try to reduce the amount of configs saved, e.g. when
                # an outer fold holds more than 100 tested configs

        if not self.output_settings.save_output:
            return
        logger.info("Writing results to project folder...")
        self.write_result_tree_to_file()
Example #19
0
    def delete(self, photon_name):
        """
        Delete Element from JSON file

        If the name is not registered, only a message is logged.

        Parameters:
        -----------
        * 'photon_name' [str]:
          The string literal encoding the class
        """

        registered_elements = PhotonRegistry.CUSTOM_ELEMENTS
        if photon_name not in registered_elements:
            logger.info(
                'Cannot remove "{0}" from CustomElements.json. Element has not been registered before.'
                .format(photon_name))
            return

        del registered_elements[photon_name]

        # persist the reduced registry before reporting the removal
        self._write_to_json(PhotonRegistry.CUSTOM_ELEMENTS)
        logger.info(
            'Removing the PipelineElement named "{0}" from CustomElements.json.'
            .format(photon_name))
Example #20
0
    def plot_confusion_matrix(self,
                              classes=None,
                              normalize=False,
                              title="Confusion matrix"):
        """
        Log and plot the confusion matrix of the test predictions.

        * 'classes': tick labels; if None, labels "class <value + 1>" are
          built from the unique values of y_true
        * 'normalize': if True every row is divided by its row sum
        * 'title': title of the figure
        """

        preds = ResultsHandler.get_test_predictions(self)
        cm = confusion_matrix(preds["y_true"], preds["y_pred"])
        np.set_printoptions(precision=2)
        if normalize:
            row_totals = cm.sum(axis=1)[:, np.newaxis]
            cm = cm.astype("float") / row_totals
            logger.info("Normalized confusion matrix")
        else:
            logger.info("Confusion matrix")
        logger.info(cm)

        plt.figure()
        plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
        plt.title(title)
        plt.colorbar()

        if classes is None:
            classes = []
            for label in np.unique(preds["y_true"]):
                classes.append("class " + str(label + 1))
        tick_positions = np.arange(len(classes))
        plt.xticks(tick_positions, classes, rotation=45)
        plt.yticks(tick_positions, classes)

        cell_format = ".2f" if normalize else "d"
        # cells above half of the maximum get white text for contrast
        half_max = cm.max() / 2.0
        for row in range(cm.shape[0]):
            for col in range(cm.shape[1]):
                cell_text = format(cm[row, col], cell_format)
                text_color = "white" if cm[row, col] > half_max else "black"
                plt.text(
                    col,
                    row,
                    cell_text,
                    horizontalalignment="center",
                    color=text_color,
                )

        plt.tight_layout()
        plt.ylabel("True label")
        plt.xlabel("Predicted label")
        # plotlyFig = ResultsHandler.__plotlyfy(plt)
        plt.show()
Example #21
0
    def calc_config(self):
        '''
            Generator over the configurations and subset-fragmentations to evaluate.

            Runs three phases: the initialization steps, the optimization
            steps and one final step that retrains the objective model and
            yields the incumbent. Each yielded value is built by
            _create_param_dict from the step result and the tracking vars.

            The returned tracking vars are for internal use and need to be passed to process_result.
        :return: next configuration to test, subset-frag to use, tracking-vars
        :rtype: dict, int, dict
        '''
        def timed_step(compute_result):
            # run one step, measure its overhead and build the yielded value
            start = time()
            result = compute_result()
            tracking = {'overhead_time': time() - start}
            logger.debug(
                'Fabolas: needed {t!s}s'.format(t=tracking['overhead_time']))
            return self._create_param_dict(result, tracking)

        def final_incumbent():
            # retrain the objective model on all data, then pick the incumbent
            self._model_objective.train(self._X, self._Y, do_optimize=True)
            return self.get_incumbent()

        logger.info('**Fabolas: Starting initialization')
        # the iteration counter lives on the instance (self._it)
        for self._it in range(0, self._n_init):
            logger.debug('Fabolas: step ' + str(self._it) + ' (init)')
            yield timed_step(self._init_models)

        self._X = np.array(self._X)
        self._Y = np.array(self._Y)
        self._cost = np.array(self._cost)

        logger.info('**Fabolas: Starting optimization')
        for self._it in range(self._n_init, self._num_iterations):
            logger.debug('Fabolas: step ' + str(self._it) + ' (opt)')
            yield timed_step(self._optimize_config)

        logger.info('Fabolas: Final config')
        yield timed_step(final_incumbent)
Example #22
0
    def _run_tests(self, photon_name, element_type):
        """
        Smoke-test a registered custom element on a scikit-learn toy dataset.

        The element is instantiated by name through PipelineElement. It is
        checked for the methods required by its element_type, fitted on the
        test data (with a random 'covariates' kwarg) and then its
        transform() or predict() output is checked for the expected number
        of samples, depending on needs_y / needs_covariates.

        Parameters:
        -----------
        * 'photon_name':
          The name under which the element is registered; passed to
          PipelineElement
        * 'element_type':
          Compared against 'Estimator' and 'Transformer'

        Returns:
        --------
        None if all tests passed. If fit(), transform() or predict() fail
        inside the guarded sections, the caught exception object is
        returned (not raised).

        Raises:
        -------
        * NotImplementedError: a required method is missing, an Estimator
          has no _estimator_type, or fit() does not return an instance of
          the element's class.
        * ValueError: _estimator_type is neither "regressor" nor
          "classifier", or the outputs could not be unpacked as expected.
        """

        # this is a sneaky hack to avoid circular imports of PipelineElement
        imported_module = importlib.import_module("photonai.base")
        desired_class = getattr(imported_module, "PipelineElement")
        custom_element = desired_class(photon_name)

        # check if has fit, transform, predict
        if not hasattr(custom_element.base_element, 'fit'):
            raise NotImplementedError(
                "Custom element does not implement fit() method.")

        if element_type == 'Transformer' and not hasattr(
                custom_element.base_element, 'transform'):
            raise NotImplementedError(
                "Custom element does not implement transform() method.")

        if element_type == 'Estimator' and not hasattr(
                custom_element.base_element, 'predict'):
            raise NotImplementedError(
                "Custom element does not implement predict() method.")

        # check if estimator is regressor or classifier
        # (the attribute is looked up on the PipelineElement wrapper, not on base_element)
        if element_type == 'Estimator':
            if hasattr(custom_element, '_estimator_type'):
                est_type = getattr(custom_element, '_estimator_type')
                if est_type == "regressor":
                    X, y = load_boston(return_X_y=True)
                elif est_type == "classifier":
                    X, y = load_breast_cancer(return_X_y=True)
                else:
                    raise ValueError(
                        "Custom element does not specify whether it is a regressor or classifier. "
                        "Is {}".format(est_type))
            else:
                raise NotImplementedError(
                    "Custom element does not specify whether it is a regressor or classifier. "
                    "Consider inheritance from ClassifierMixin or RegressorMixin or set "
                    "_estimator_type manually.")
        else:
            # everything that is not an 'Estimator' is tested on the boston data
            X, y = load_boston(return_X_y=True)

        # try and test functionality
        # one random covariate value per sample, passed along as kwarg
        kwargs = {'covariates': np.random.randn(len(y))}

        try:
            # test fit
            returned_element = custom_element.base_element.fit(X, y, **kwargs)
        except Exception as e:
            logger.info(
                "Not able to run tests on fit() method. Test data not compatible."
            )
            # the error is handed back to the caller instead of being raised
            return e

        if not isinstance(returned_element,
                          custom_element.base_element.__class__):
            raise NotImplementedError("fit() method does not return self.")

        try:
            # test transform or predict (if base element does not implement transform method, predict should be called
            # by PipelineElement -> so we only need to test transform()
            # NOTE: errors raised inside this try block (including the
            # NotImplementedError / ValueError raised below) are caught by
            # the handlers at the end and returned rather than propagated
            if custom_element.needs_y:
                if element_type == 'Estimator':
                    raise NotImplementedError("Estimator should not need y.")
                # with needs_y, transform() is expected to return three values
                Xt, yt, kwargst = custom_element.base_element.transform(
                    X, y, **kwargs)
                if 'covariates' not in kwargst.keys():
                    raise ValueError(
                        "Custom element does not correctly transform kwargs although needs_y is True. "
                        "If you change the number of samples in transform(), make sure to transform kwargs "
                        "respectively.")
                if not len(kwargst['covariates']) == len(X):
                    raise ValueError(
                        "Custom element is not returning the correct number of samples!"
                    )

            elif custom_element.needs_covariates:
                # with needs_covariates, two values (output, kwargs) are expected
                if element_type == 'Estimator':
                    yt, kwargst = custom_element.base_element.predict(
                        X, **kwargs)
                    if not len(yt) == len(y):
                        raise ValueError(
                            "Custom element is not returning the correct number of samples!"
                        )
                else:
                    Xt, kwargst = custom_element.base_element.transform(
                        X, **kwargs)

                    if not len(Xt) == len(X) or not len(
                            kwargst['covariates']) == len(X):
                        raise ValueError(
                            "Custom element is not returning the correct number of samples!"
                        )

            else:
                # plain case: a single return value, called without kwargs
                if element_type == 'Estimator':
                    yt = custom_element.base_element.predict(X)
                    if not len(yt) == len(y):
                        raise ValueError(
                            "Custom element is not returning the correct number of samples!"
                        )
                else:
                    Xt = custom_element.base_element.transform(X)
                    if not len(Xt) == len(X):
                        raise ValueError(
                            "Custom element is not returning the correct number of samples!"
                        )

        except ValueError as ve:
            # an unpacking failure means the number of returned values does
            # not match needs_y / needs_covariates; only this case is re-raised
            # NOTE(review): ve.args[0] assumes the exception carries a string
            # message; an argument-less ValueError would fail here -- confirm
            if "too many values to unpack" in ve.args[0]:
                raise ValueError(
                    "Custom element does not return X, y and kwargs the way it should "
                    "according to needs_y and needs_covariates.")
            else:
                logger.info(ve.args[0])
                return ve
        except Exception as e:
            # NOTE(review): e.args[0] raises IndexError for exceptions
            # created without arguments -- confirm whether that can happen
            logger.info(e.args[0])
            logger.info(
                "Not able to run tests on transform() or predict() method. Test data probably not compatible."
            )
            return e

        logger.info('All tests on custom element passed.')
Example #23
0
    def fit(self, X, y=None, **kwargs):
        logger.photon_system_log('')
        logger.photon_system_log(
            '***************************************************************************************************************'
        )
        logger.photon_system_log('Outer Cross validation Fold {}'.format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr))
        logger.photon_system_log(
            '***************************************************************************************************************'
        )

        self._prepare_data(X, y, **kwargs)
        self._fit_dummy()
        self._generate_inner_folds()
        self._prepare_optimization()

        outer_fold_fit_start_time = datetime.datetime.now()
        self.best_metric_yet = None
        self.tested_config_counter = 0

        # distribute number of folds to encapsulated child hyperpipes
        # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
        #                                                 outer_fold_counter=outer_fold_counter)

        if self.cross_validaton_info.calculate_metrics_per_fold:
            self.fold_operation = FoldOperations.MEAN
        else:
            self.fold_operation = FoldOperations.RAW

        self.max_nr_of_configs = ''
        if hasattr(self.optimizer, 'n_configurations'):
            self.max_nr_of_configs = str(self.optimizer.n_configurations)

        if isinstance(self.optimizer, PhotonMasterOptimizer):
            self.optimizer.optimize()
        else:
            # do the optimizing
            for current_config in self.optimizer.ask:
                self.objective_function(current_config)

        logger.clean_info(
            '---------------------------------------------------------------------------------------------------------------'
        )
        logger.info(
            'Hyperparameter Optimization finished. Now finding best configuration .... '
        )
        print(self.tested_config_counter)
        # now go on with the best config found
        if self.tested_config_counter > 0:
            best_config_outer_fold = self.optimization_info.get_optimum_config(
                self.result_object.tested_config_list, self.fold_operation)

            if not best_config_outer_fold:
                raise Exception("No best config was found!")

            # ... and create optimal pipeline
            optimum_pipe = self.copy_pipe_fnc()
            if self.cache_updater is not None:
                self.cache_updater(optimum_pipe, self.cache_folder,
                                   "fixed_fold_id")
            optimum_pipe.caching = False
            # set self to best config
            optimum_pipe.set_params(**best_config_outer_fold.config_dict)

            # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
            # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
            #     if child_config:
            #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
            #         splitted_name = child_name.split('__')
            #         if len(splitted_name) > 1:
            #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
            #             pipe_element = stacking_element.elements[splitted_name[1]]
            #         else:
            #             pipe_element = self.optimum_pipe.named_steps[child_name]
            #         pipe_element.set_params(**child_config)
            #         pipe_element.is_final_fit = True

            # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

            logger.debug(
                'Fitting model with best configuration of outer fold...')
            optimum_pipe.fit(self._validation_X, self._validation_y,
                             **self._validation_kwargs)

            self.result_object.best_config = best_config_outer_fold

            # save test performance
            best_config_performance_mdb = MDBInnerFold()
            best_config_performance_mdb.fold_nr = -99
            best_config_performance_mdb.number_samples_training = self._validation_y.shape[
                0]
            best_config_performance_mdb.number_samples_validation = self._test_y.shape[
                0]
            best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

            if self.cross_validaton_info.eval_final_performance:
                # Todo: generate mean and std over outer folds as well. move this items to the top
                logger.info(
                    'Calculating best model performance on test set...')

                logger.debug('...scoring test data')
                test_score_mdb = InnerFoldManager.score(
                    optimum_pipe,
                    self._test_X,
                    self._test_y,
                    indices=self.cross_validaton_info.outer_folds[
                        self.outer_fold_id].test_indices,
                    metrics=self.optimization_info.metrics,
                    **self._test_kwargs)

                logger.debug('... scoring training data')

                train_score_mdb = InnerFoldManager.score(
                    optimum_pipe,
                    self._validation_X,
                    self._validation_y,
                    indices=self.cross_validaton_info.outer_folds[
                        self.outer_fold_id].train_indices,
                    metrics=self.optimization_info.metrics,
                    training=True,
                    **self._validation_kwargs)

                best_config_performance_mdb.training = train_score_mdb
                best_config_performance_mdb.validation = test_score_mdb

                print_double_metrics(train_score_mdb.metrics,
                                     test_score_mdb.metrics)
            else:

                def _copy_inner_fold_means(metric_dict):
                    # We copy all mean values from validation to the best config
                    # training
                    train_item_metrics = {}
                    for m in metric_dict:
                        if m.operation == str(self.fold_operation):
                            train_item_metrics[m.metric_name] = m.value
                    train_item = MDBScoreInformation()
                    train_item.metrics_copied_from_inner = True
                    train_item.metrics = train_item_metrics
                    return train_item

                # training
                best_config_performance_mdb.training = _copy_inner_fold_means(
                    best_config_outer_fold.metrics_train)
                # validation
                best_config_performance_mdb.validation = _copy_inner_fold_means(
                    best_config_outer_fold.metrics_test)

            # write best config performance to best config item
            self.result_object.best_config.best_config_score = best_config_performance_mdb

        logger.info('Computations in outer fold {} took {} minutes.'.format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
            (datetime.datetime.now() -
             outer_fold_fit_start_time).total_seconds() / 60))
Example #24
0
    def objective_function(self, current_config):
        """
        Cross-validate one hyperparameter configuration and return its loss.

        The configuration is fitted through an InnerFoldManager on
        self._validation_X / self._validation_y. Afterwards the best
        configuration seen so far is updated, the result is appended to
        self.result_object.tested_config_list and, if the optimizer is a
        PhotonSlaveOptimizer, it is told about the performance.

        :param current_config: configuration to evaluate; None is ignored
        :returns: None if current_config is None. Otherwise the validation
                  value of the best-config metric, reported as ``1 - value``
                  when the metric is maximized. A failed configuration is
                  scored with the placeholder performance (-1, -1).
        """
        if current_config is None:
            return

        logger.clean_info(
            '---------------------------------------------------------------------------------------------------------------'
        )
        self.tested_config_counter += 1

        # the optimizer may hand out its own pipeline constructor
        pipe_ctor = (self.optimizer.ask_for_pipe() if hasattr(
            self.optimizer, 'ask_for_pipe') else self.copy_pipe_fnc)

        fold_manager = InnerFoldManager(pipe_ctor,
                                        current_config,
                                        self.optimization_info,
                                        self.cross_validaton_info,
                                        self.outer_fold_id,
                                        self.constraint_objects,
                                        cache_folder=self.cache_folder,
                                        cache_updater=self.cache_updater)

        # evaluate the configuration with the inner cross validation
        current_config_mdb = fold_manager.fit(self._validation_X,
                                              self._validation_y,
                                              **self._validation_kwargs)
        current_config_mdb.config_nr = self.tested_config_counter

        if current_config_mdb.config_failed:
            # placeholder performance for configurations that broke
            config_performance = (-1, -1)
            logger.debug('...failed:')
            logger.error(current_config_mdb.config_error)
        else:
            metric_train = MDBHelper.get_metric(
                current_config_mdb, self.fold_operation,
                self.optimization_info.best_config_metric)
            metric_test = MDBHelper.get_metric(
                current_config_mdb,
                self.fold_operation,
                self.optimization_info.best_config_metric,
                train=False)

            if metric_train is None or metric_test is None:
                raise Exception(
                    "Config did not fail, but did not get any metrics either....!!?"
                )

            config_performance = (metric_train, metric_test)
            if self.best_metric_yet is None:
                # very first successful configuration
                self.best_metric_yet = config_performance
                self.current_best_config = current_config_mdb
            else:
                previous_best = self.best_metric_yet[1]
                if self.optimization_info.maximize_metric:
                    improved = metric_test > previous_best
                else:
                    improved = metric_test < previous_best

                if improved:
                    # new champion: slim down the dethroned config
                    self.best_metric_yet = config_performance
                    self.current_best_config.save_memory()
                    self.current_best_config = current_config_mdb
                else:
                    current_config_mdb.save_memory()

            # report the outcome of this configuration
            elapsed = current_config_mdb.computation_end_time - current_config_mdb.computation_start_time
            metric_name = self.optimization_info.best_config_metric
            logger.info('Computed configuration ' +
                        str(self.tested_config_counter) + "/" +
                        self.max_nr_of_configs + " in " + str(elapsed))
            logger.info("Performance:             " + metric_name +
                        " - Train: " + "%.4f" % config_performance[0] +
                        ", Validation: " + "%.4f" % config_performance[1])
            logger.info("Best Performance So Far: " + metric_name +
                        " - Train: " + "%.4f" % self.best_metric_yet[0] +
                        ", Validation: " + "%.4f" % self.best_metric_yet[1])

        # keep every tested configuration in the result tree
        self.result_object.tested_config_list.append(current_config_mdb)

        logger.debug(
            "Telling hyperparameter optimizer about recent performance.")
        if isinstance(self.optimizer, PhotonSlaveOptimizer):
            self.optimizer.tell(current_config, config_performance)
        logger.debug("Asking hyperparameter optimizer for new config.")

        validation_score = config_performance[1]
        if self.optimization_info.maximize_metric:
            return 1 - validation_score
        return validation_score
Example #25
0
 def prepare(self, pipeline_elements, maximize_metric):
     """
     Store the pipeline elements and build the grid of configurations.

     :param pipeline_elements: elements handed to create_global_config_grid
     :param maximize_metric: accepted but not used by this implementation
     """
     self.pipeline_elements = pipeline_elements
     self.ask = self.next_config_generator()
     grid = create_global_config_grid(self.pipeline_elements)
     self.param_grid = grid
     n_configs = str(len(self.param_grid))
     logger.info("Grid Search generated " + n_configs + " configurations")
Example #26
0
    def fit(self, X, y, **kwargs):
        """
        Iterates over cross-validation folds and trains the pipeline, then uses it for predictions.
        Calculates metrics per fold and averages them over fold.

        Cross validation stops early as soon as one optimization constraint
        reports that it shall not continue. Exceptions raised while processing
        the folds are logged and stored in ``config_error``; the configuration is
        flagged as failed unless the exception is a Warning. If
        ``self.raise_error`` is set the exception is re-raised instead.

        :param X: Training and test data
        :param y: Training and test targets
        :param kwargs: additional data that is split together with X and y for each fold
        :returns: configuration class for result tree that monitors training and test performance
        """
        config_item = MDBConfig()
        config_item.config_dict = self.params
        config_item.inner_folds = []
        config_item.metrics_test = []
        config_item.metrics_train = []
        config_item.computation_start_time = datetime.datetime.now()

        try:
            # Constraints may be given as a list, a single object or None.
            # Normalise once so every case is checked (and logged) the same way.
            constraints = self.optimization_constraints
            if constraints is None:
                constraints = []
            elif not isinstance(constraints, list):
                constraints = [constraints]

            # do inner cv
            for idx, (inner_fold_id, inner_fold) in enumerate(
                    self.cross_validation_infos.inner_folds[
                        self.outer_fold_id].items()):

                train, test = inner_fold.train_indices, inner_fold.test_indices

                # split kwargs according to cross validation
                train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=train)
                test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=test)

                # every fold gets a fresh pipeline instance
                new_pipe = self.pipe()
                if self.cache_folder is not None and self.cache_updater is not None:
                    self.cache_updater(new_pipe, self.cache_folder,
                                       inner_fold_id)

                # the readable config is created (and printed) only once
                if not config_item.human_readable_config:
                    config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                        new_pipe, self.params)
                    logger.clean_info(
                        json.dumps(config_item.human_readable_config,
                                   indent=4,
                                   sort_keys=True))

                job_data = InnerFoldManager.InnerCVJob(
                    pipe=new_pipe,
                    config=dict(self.params),
                    metrics=self.optimization_infos.metrics,
                    callbacks=self.optimization_constraints,
                    train_data=InnerFoldManager.JobData(
                        train_X, train_y, train, kwargs_cv_train),
                    test_data=InnerFoldManager.JobData(test_X, test_y, test,
                                                       kwargs_cv_test),
                )

                fold_nr = idx + 1
                logger.debug("calculating inner fold " + str(fold_nr) + "...")

                curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(
                    job_data)
                logger.debug("Performance inner fold " + str(fold_nr))
                print_double_metrics(
                    curr_train_fold.metrics,
                    curr_test_fold.metrics,
                    photon_system_log=False,
                )

                durations = job_data.pipe.time_monitor

                self.update_config_item_with_inner_fold(
                    config_item=config_item,
                    fold_cnt=fold_nr,
                    curr_train_fold=curr_train_fold,
                    curr_test_fold=curr_test_fold,
                    time_monitor=durations,
                    feature_importances=new_pipe.feature_importances_,
                )

                # Stop at the first constraint that vetoes further folds.
                # The message uses that constraint's own metric; previously
                # the single-constraint case referenced an undefined name.
                violated = next(
                    (cf for cf in constraints
                     if not cf.shall_continue(config_item)), None)
                if violated is not None:
                    logger.info(
                        "Skipped further cross validation after fold " +
                        str(fold_nr) +
                        " due to performance constraints in " +
                        violated.metric)
                    break

            InnerFoldManager.process_fit_results(
                config_item,
                self.cross_validation_infos.calculate_metrics_across_folds,
                self.cross_validation_infos.calculate_metrics_per_fold,
                self.optimization_infos.metrics,
            )

        except Exception as e:
            if self.raise_error:
                # bare raise keeps the original traceback untouched
                raise
            logger.error(e)
            logger.error(traceback.format_exc())
            traceback.print_exc()
            # warnings are recorded but do not invalidate the configuration
            if not isinstance(e, Warning):
                config_item.config_failed = True
            config_item.config_error = str(e)
            warnings.warn("One test iteration of pipeline failed with error")

        logger.debug("...done with")
        logger.debug(
            json.dumps(config_item.human_readable_config,
                       indent=4,
                       sort_keys=True))

        config_item.computation_end_time = datetime.datetime.now()
        return config_item
Example #27
0
    def fit(self, X, y, **kwargs):
        """
        Run the permutation test for the hyperpipe built by ``self.hyperpipe_constructor``.

        Steps:
          1. Look up a completed reference run (true targets) in the database
             or compute it if none exists.
          2. Abort if the reference is not usable according to
             ``__validate_usability``.
          3. Compute every permutation run that has no completed entry in the
             database yet, in parallel when ``self.n_processes > 1``.
          4. Calculate the results and write the p values to
             ``permutation_test_results.csv`` in the reference's output folder.

        :param X: the data
        :param y: the true targets
        :param kwargs: additional arguments forwarded to the hyperpipe fits
        :returns: self
        :raises ValueError: if the hyperpipe has no MongoDB connection string
        :raises RuntimeError: if the reference result fails the usability check
        """
        self.pipe = self.hyperpipe_constructor()

        # we need a mongodb to collect the results!
        mongodb_url = self.pipe.output_settings.mongodb_connect_url
        if not mongodb_url:
            raise ValueError(
                "MongoDB connection string must be given for permutation tests"
            )

        # Get all specified metrics
        best_config_metric = self.pipe.optimization.best_config_metric
        self.metrics = PermutationTest.manage_metrics(
            self.pipe.optimization.metrics, self.pipe.elements[-1],
            best_config_metric)

        # at first we do a reference optimization
        y_true = y

        # Run with true labels
        connect(mongodb_url, alias="photon_core")
        # Check if it already exists in DB
        try:
            existing_reference = MDBHyperpipe.objects.raw({
                'permutation_id':
                self.mother_permutation_id,
                'computation_completed':
                True
            }).first()
            if not existing_reference.permutation_test:
                existing_reference.permutation_test = MDBPermutationResults(
                    n_perms=self.n_perms)
                existing_reference.save()
            logger.info(
                "Found hyperpipe computation with true targets, skipping the optimization process with true targets"
            )
        except DoesNotExist:
            # if we havent computed the reference value do it:
            logger.info("Calculating Reference Values with true targets.")
            try:
                self.pipe.permutation_id = self.mother_permutation_id
                self.pipe.fit(X, y_true, **kwargs)
                self.pipe.results.computation_completed = True

                self.pipe.results.permutation_test = MDBPermutationResults(
                    n_perms=self.n_perms)
                self.pipe.results.save()
                existing_reference = self.pipe.results

            except Exception as e:
                if self.pipe.results is not None:
                    self.pipe.results.permutation_failed = str(e)
                    self.pipe.results.save()
                # Without a reference nothing below can work. Propagate the
                # real error instead of failing later on an unbound name.
                raise

        # check for sanity
        if not self.__validate_usability(existing_reference):
            raise RuntimeError(
                "Permutation Test is not adviced because results are not better than dummy. Aborting."
            )

        # find which permutations have been computed already; the run index
        # is the last '_'-separated token of the stored name
        existing_permutations = list(
            MDBHyperpipe.objects.raw({
                'permutation_id': self.permutation_id,
                'computation_completed': True
            }).only('name'))
        existing_permutations = [
            int(perm_run.name.split('_')[-1])
            for perm_run in existing_permutations
        ]

        # only runs without a completed database entry remain to be computed
        if len(existing_permutations) > 0:
            perms_todo = set(np.arange(
                self.n_perms)) - set(existing_permutations)
        else:
            perms_todo = np.arange(self.n_perms)

        logger.info(str(len(perms_todo)) + " permutation runs to do")

        if len(perms_todo) > 0:
            # create permutation labels; always all n_perms are generated so
            # that run i gets the same labels regardless of what is left to do
            np.random.seed(self.random_state)
            self.permutations = [
                np.random.permutation(y_true) for _ in range(self.n_perms)
            ]

            if self.n_processes > 1:
                # Create the client outside the try block: if construction
                # fails there is nothing to close and the error is not masked.
                my_client = Client(threads_per_worker=1,
                                   n_workers=self.n_processes,
                                   processes=False)
                try:
                    job_list = [
                        dask.delayed(
                            PermutationTest.run_parallelized_permutation)(
                                self.hyperpipe_constructor, X, perm_run,
                                self.permutations[perm_run],
                                self.permutation_id, self.verbosity, **kwargs)
                        for perm_run in perms_todo
                    ]
                    dask.compute(*job_list)
                finally:
                    my_client.close()
            else:
                for perm_run in perms_todo:
                    PermutationTest.run_parallelized_permutation(
                        self.hyperpipe_constructor, X, perm_run,
                        self.permutations[perm_run], self.permutation_id,
                        self.verbosity, **kwargs)

        # evaluate against the database this pipe is configured for rather
        # than the hard-coded default of _calculate_results
        perm_result = self._calculate_results(self.permutation_id,
                                              mongodb_path=mongodb_url)

        performance_df = pd.DataFrame(
            {name: [value] for name, value in perm_result.p_values.items()})
        performance_df.to_csv(
            os.path.join(existing_reference.output_folder,
                         'permutation_test_results.csv'))
        return self
Example #28
0
 def collect_results(self, result):
     """
     Log that a permutation run has finished.

     Presumably used as a completion callback for permutation jobs - verify
     against the caller.

     :param result: value reported by the finished run; only its string
                    representation is logged
     """
     message = "Finished Permutation Run" + str(result)
     logger.info(message)
Example #29
0
    def start_flask(storage, pipe_name):
        """
        Log the investigator url for a pipe and start the flask app.

        The url is built from the arguments, logged, handed to
        Investigator.__delayed_browser (presumably to open it in a browser)
        and finally the FlaskManager app is run.

        :param storage: storage identifier passed on to the url builder
        :param pipe_name: name of the pipe passed on to the url builder
        """
        investigator_url = Investigator.__build_url(storage, pipe_name)
        logger.info("Your url is: " + investigator_url)
        Investigator.__delayed_browser(investigator_url)

        flask_manager = FlaskManager()
        flask_manager.run_app()
Example #30
0
    def _calculate_results(
            permutation_id,
            save_to_db=True,
            mongodb_path="mongodb://trap-umbriel:27017/photon_results"):
        """
        Compute the p values of a permutation test from the stored runs.

        The reference run is looked up with PermutationTest.find_reference,
        all completed permutation runs with the given id are loaded and their
        mean test metrics are compared with those of the reference.

        :param permutation_id: id shared by all permutation runs of the test
        :param save_to_db: if True, the results are written to the
                           ``permutation_test`` field of the reference run
        :param mongodb_path: connection string passed to find_reference
        :returns: a PermutationTest.PermutationResult, or None if no reference
                  run could be found
        """
        logger.info("Calculating permutation test results")
        try:
            mother_permutation = PermutationTest.find_reference(
                mongodb_path, permutation_id)
        except DoesNotExist:
            return None
        else:
            mean_operation = "FoldOperations.MEAN"

            finished_runs = list(
                MDBHyperpipe.objects.raw({
                    'permutation_id': permutation_id,
                    'computation_completed': True
                }).project({'metrics_test': 1}))
            # never report zero runs, the count is used as a divisor below
            number_of_permutations = len(finished_runs) or 1

            # mean test performance of the run with the true targets
            true_performances = {
                m.metric_name: m.value
                for m in mother_permutation.metrics_test
                if m.operation == mean_operation
            }

            metric_list = list(
                {m.metric_name
                 for m in mother_permutation.metrics_test})
            metrics = PermutationTest.manage_metrics(
                metric_list, None,
                mother_permutation.hyperpipe_info.best_config_metric)

            # mean test performances of all permutation runs, per metric
            perm_performances = {
                metric["name"]: [
                    m.value for run in finished_runs
                    for m in run.metrics_test
                    if m.metric_name == metric["name"]
                    and m.operation == mean_operation
                ]
                for metric in metrics.values()
            }

            # Calculate p-value
            p = PermutationTest.calculate_p(
                true_performance=true_performances,
                perm_performances=perm_performances,
                metrics=metrics,
                n_perms=number_of_permutations)

            # a p value of exactly 0 is reported as an upper bound instead
            p_text = dict()
            for metric in metrics.values():
                name = metric['name']
                if p[name] == 0:
                    p_text[name] = "p < {}".format(
                        str(1 / number_of_permutations))
                else:
                    p_text[name] = "p = {}".format(p[name])

            # Print results
            logger.clean_info("""
            Done with permutations...

            Results Permutation test
            ===============================================
            """)
            for metric in metrics.values():
                name = metric['name']
                logger.clean_info("""
                    Metric: {}
                    True Performance: {}
                    p Value: {}

                """.format(name, true_performances[name], p_text[name]))

            if save_to_db:
                # reuse the stored results object if the reference has one
                perm_results = mother_permutation.permutation_test
                if perm_results is None:
                    perm_results = MDBPermutationResults(
                        n_perms=number_of_permutations)
                perm_results.n_perms_done = number_of_permutations

                results_all_metrics = list()
                for metric in metrics.values():
                    name = metric['name']
                    metric_result = MDBPermutationMetrics(
                        metric_name=name,
                        p_value=p[name],
                        metric_value=true_performances[name])
                    metric_result.values_permutations = perm_performances[name]
                    results_all_metrics.append(metric_result)

                perm_results.metrics = results_all_metrics
                mother_permutation.permutation_test = perm_results
                mother_permutation.save()

            # fall back to 1000 if the planned number of runs is unknown
            stored_test = mother_permutation.permutation_test
            n_perms = stored_test.n_perms if stored_test is not None else 1000

            return PermutationTest.PermutationResult(true_performances,
                                                     perm_performances, p,
                                                     number_of_permutations,
                                                     n_perms)