Exemple #1
0
    def _prepare_data(self, X, y=None, **kwargs):
        """
        Split the data into the validation and test partition of the current outer fold.

        The fold is looked up in ``self.cross_validaton_info.outer_folds`` via
        ``self.outer_fold_id``. The samples selected by the fold's train indices are
        stored as ``self._validation_*``, the ones selected by its test indices as
        ``self._test_*``. Sample counts (and, for classifiers, the class distributions)
        are written to ``self.result_object``.

        :param X: the data to split
        :param y: the targets to split
        :param kwargs: additional arguments that are split alongside X and y
        """
        outer_fold = self.cross_validaton_info.outer_folds[self.outer_fold_id]
        logger.info(
            "".join(["Preparing data for outer fold ", str(outer_fold.fold_nr), "..."])
        )

        # fetch both index sets first, then cut the data accordingly
        validation_idx = outer_fold.train_indices
        test_idx = outer_fold.test_indices

        validation_split = PhotonDataHelper.split_data(
            X, y, kwargs, indices=validation_idx
        )
        self._validation_X, self._validation_y, self._validation_kwargs = validation_split

        test_split = PhotonDataHelper.split_data(X, y, kwargs, indices=test_idx)
        self._test_X, self._test_y, self._test_kwargs = test_split

        # bookkeeping for the result object
        results = self.result_object
        results.number_samples_validation = self._validation_y.shape[0]
        results.number_samples_test = self._test_y.shape[0]
        if self._pipe._estimator_type == "classifier":
            # class distributions are only recorded for classification problems
            results.class_distribution_validation = FoldInfo.data_overview(
                self._validation_y
            )
            results.class_distribution_test = FoldInfo.data_overview(self._test_y)
Exemple #2
0
    def apply_transform_parallelized(self, X):
        """
        Apply the branch to X batch-wise through dask.

        The items of X are distributed in equally sized batches, one delayed job
        is created per batch on a fresh copy of the branch, and all jobs are
        computed together. Nothing happens unless ``self.nr_of_processes`` is
        greater than one.

        :param X: the data to which the delegate should be applied in parallel
        """
        if not self.nr_of_processes > 1:
            return

        def _build_job(batch):
            # every job works on its own copy of the pipeline
            branch_copy = self.copy_me()
            pipe_copy = branch_copy.base_element
            pipe_copy.cache_folder = self.base_element.cache_folder
            pipe_copy.skip_loading = True
            pipe_copy._parallel_use = True
            return dask.delayed(NeuroBranch.parallel_application)(pipe_copy, batch)

        # distribute the data equally to all available cores
        total_items = PhotonDataHelper.find_n(X)
        items_per_core = int(np.ceil(total_items / self.nr_of_processes))
        logger.info("".join([
            "NeuroBranch ", self.name, ": Using ", str(self.nr_of_processes),
            " cores calculating ", str(items_per_core), " items each",
        ]))

        delayed_jobs = []
        for first, last in PhotonDataHelper.chunker(total_items, items_per_core):
            batch, _, _ = PhotonDataHelper.split_data(X, None, {}, first, last)
            delayed_jobs.append(_build_job(batch))

        dask.compute(*delayed_jobs)
Exemple #3
0
        def objective_function_simple(self, cfg):
            """
            Score a configuration with a StandardScaler -> PCA -> SVC pipeline.

            The pipeline is fitted and evaluated on every inner fold of the first
            outer fold, using the training part of that outer fold as data.

            :param cfg: mapping of pipeline parameter names to values; entries
                with a falsy value are dropped before use
            :returns: one minus the mean accuracy over the inner folds
            """
            # NOTE(review): the truthiness filter also removes 0 / False values,
            # not only None - confirm this is intended
            active_cfg = {key: cfg[key] for key in cfg if cfg[key]}

            first_outer_fold = list(self.pipe.cross_validation.outer_folds.values())[0]
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None,
                indices=first_outer_fold.train_indices)

            first_inner_folds = list(
                self.pipe.cross_validation.inner_folds.values())[0]

            accuracies = []
            for fold in list(first_inner_folds.values()):
                pipeline = PhotonPipeline([
                    ('StandardScaler', PipelineElement("StandardScaler", {})),
                    ('PCA', PipelineElement("PCA", {}, random_state=42)),
                    ('SVC', PipelineElement("SVC", {}, random_state=42,
                                            gamma='auto')),
                ])
                pipeline.set_params(**active_cfg)

                pipeline.fit(self._validation_X[fold.train_indices, :],
                             self._validation_y[fold.train_indices])

                y_true = self._validation_y[fold.test_indices]
                y_pred = pipeline.predict(
                    self._validation_X[fold.test_indices, :])
                accuracies.append(accuracy_score(y_true, y_pred))

            return 1 - np.mean(accuracies)
Exemple #4
0
 def compute_learning_curves(self, new_pipe, train_X, train_y, train,
                             kwargs_cv_train, test_X, test_y, test,
                             kwargs_cv_test):
     """
     Fit and score the pipeline on growing leading slices of the training data.

     The cut fractions come from ``self.cross_validation_infos.learning_curves_cut``.
     Each fraction is converted into a sample count relative to ``train_X``; the
     first count is skipped. For every remaining count the first ``count``
     training samples are used for fitting, while the test data stays untouched.

     :param new_pipe: the pipeline handed to every job
     :param train_X: training data
     :param train_y: training targets
     :param train: training indices
     :param kwargs_cv_train: additional training arguments, cut like train_X
     :param test_X: test data
     :param test_y: test targets
     :param test: test indices
     :param kwargs_cv_test: additional test arguments
     :returns: list of ``[cut value, test metrics, train metrics]`` entries
     """
     cut_spec = self.cross_validation_infos.learning_curves_cut
     cut_spec.transform()

     n_train = train_X.shape[0]
     sample_counts = [round(fraction * n_train) for fraction in cut_spec.values]

     curves = []
     for position, n_samples in enumerate(sample_counts[1:]):
         leading_rows = np.arange(n_samples)
         sub_X, sub_y, sub_kwargs = PhotonDataHelper.split_data(
             train_X, train_y, kwargs_cv_train, indices=leading_rows)
         sub_train = train[:n_samples]

         job = self.InnerCVJob(
             pipe=new_pipe,
             config=dict(self.params),
             metrics=self.optimization_infos.metrics,
             callbacks=self.optimization_constraints,
             train_data=self.JobData(sub_X, sub_y, sub_train, sub_kwargs),
             test_data=self.JobData(test_X, test_y, test, kwargs_cv_test))
         test_scores, train_scores = InnerFoldManager.fit_and_score(job)

         # NOTE(review): `position` counts from 0 while the sample counts start
         # at the second cut, so the stored cut value is the one preceding the
         # cut actually used - confirm this labelling is intended
         cut_label = cut_spec.values[position]
         curves.append([cut_label, test_scores.metrics, train_scores.metrics])
     return curves
Exemple #5
0
    def test_split_join_resorting(self):
        """
        Split data item by item in a shuffled order, join it again and resort it.

        Checks that single-item splitting returns the right item, that joining
        yields the items in the order they were added, and that resorting with
        the joined index list restores the original order.
        """
        X = np.arange(1, 11)
        y = np.repeat([1, 2], 5)
        kwargs = {"test": -np.arange(1, 11)}

        # randomly assign every index to one of two lists
        idx_list_one, idx_list_two = [], []
        for idx in range(len(X)):
            target = idx_list_one if random.getrandbits(1) else idx_list_two
            target.append(idx)

        # split single items (second list first) and join them as we go
        X_new, y_new, kwargs_new = [], [], {}
        for idx in idx_list_two + idx_list_one:
            X_batched, y_batched, kwargs_batched = PhotonDataHelper.split_data(
                X, y, kwargs, idx, idx)

            # test if batching works
            self.assertEqual(X_batched, X[idx])
            self.assertEqual(y_batched, y[idx])
            self.assertDictEqual(kwargs_batched,
                                 {"test": [kwargs["test"][idx]]})

            # then join again
            X_new, y_new, kwargs_new = PhotonDataHelper.join_data(
                X_new, X_batched, y_new, y_batched, kwargs_new,
                kwargs_batched)

        # test if joining works
        joined_idx = PhotonDataHelper.stack_data_vertically(
            idx_list_two, idx_list_one)
        for joined, source in ((X_new, X), (y_new, y),
                               (kwargs_new["test"], kwargs["test"])):
            self.assertTrue(np.array_equal(joined, source[joined_idx]))

        # now resort and see if that works too
        resorted = PhotonDataHelper.resort_splitted_data(
            X_new, y_new, kwargs_new, joined_idx)
        X_resorted, y_resorted, kwargs_resorted = resorted
        self.assertTrue(np.array_equal(X_resorted, X))
        self.assertTrue(np.array_equal(y_resorted, y))
        self.assertListEqual(list(kwargs_resorted), list(kwargs))
        self.assertTrue(np.array_equal(kwargs_resorted["test"],
                                       kwargs["test"]))
Exemple #6
0
 def test_data_split_indices(self):
     """
     Splitting by an index list must pick the same rows from X, y and every kwarg.
     """
     vals = -np.arange(1, 11)
     vals_str = np.array([ascii(value) for value in vals])
     random_features = np.random.randn(10, 20)
     kwargs = {"test": vals, "subtest": vals_str, "random": random_features}
     pick_list = [1, 3, 5]

     splitted_X, splitted_y, splitted_example = PhotonDataHelper.split_data(
         random_features, vals, kwargs, indices=pick_list)

     self.assertTrue(np.array_equal(splitted_X, random_features[pick_list]))
     self.assertTrue(np.array_equal(splitted_y, vals[pick_list]))
     # every kwarg entry has to be reduced to the picked rows as well
     for key, full_values in kwargs.items():
         self.assertTrue(
             np.array_equal(splitted_example[key], full_values[pick_list]))
Exemple #7
0
        def objective_function_switch(self, cfg):
            """
            Score a configuration with a StandardScaler -> PCA -> estimator pipeline.

            The estimator is an SVC when ``cfg["Estimator_switch"]`` equals
            ``'svc'`` and a RandomForestClassifier otherwise. The pipeline is
            fitted and evaluated on every inner fold of the first outer fold,
            using the training part of that outer fold as data.

            :param cfg: mapping of parameter names to values, including the key
                "Estimator_switch"; entries with a falsy value are dropped
            :returns: one minus the mean accuracy over the inner folds
            """
            # NOTE(review): the truthiness filter also removes 0 / False values,
            # not only None - confirm this is intended
            active_cfg = {key: cfg[key] for key in cfg if cfg[key]}

            first_outer_fold = list(self.pipe.cross_validation.outer_folds.values())[0]
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None,
                indices=first_outer_fold.train_indices)

            # the switch selects the estimator and is not a pipeline parameter
            estimator_choice = active_cfg.pop("Estimator_switch")
            if estimator_choice == 'svc':
                estimator_name = 'SVC'
                estimator_kwargs = {"random_state": 42, "gamma": 'auto'}
            else:
                estimator_name = "RandomForestClassifier"
                estimator_kwargs = {"random_state": 42}

            first_inner_folds = list(
                self.pipe.cross_validation.inner_folds.values())[0]

            accuracies = []
            for fold in list(first_inner_folds.values()):
                pipeline = PhotonPipeline([
                    ('StandardScaler', PipelineElement("StandardScaler", {})),
                    ('PCA', PipelineElement("PCA", {}, random_state=42)),
                    (estimator_name,
                     PipelineElement(estimator_name, {}, **estimator_kwargs)),
                ])
                pipeline.set_params(**active_cfg)

                pipeline.fit(self._validation_X[fold.train_indices, :],
                             self._validation_y[fold.train_indices])

                y_true = self._validation_y[fold.test_indices]
                y_pred = pipeline.predict(
                    self._validation_X[fold.test_indices, :])
                accuracies.append(accuracy_score(y_true, y_pred))

            return 1 - np.mean(accuracies)
Exemple #8
0
    def load_or_save_cached_data(self,
                                 name,
                                 X,
                                 y,
                                 kwargs,
                                 transformer,
                                 fit=False,
                                 needed_for_further_computation=False,
                                 initial_X=None):
        """
        Return the output of one transformer, using the cache wherever possible.

        Group-wise mode (``self.single_subject_caching`` is falsy): the whole
        result is cached under ``name``. If only the cache has to be filled
        (``self.skip_loading`` and not ``needed_for_further_computation``) and an
        entry already exists, the inputs are returned untouched. Otherwise the
        entry is loaded, or computed via ``self._do_timed_fit_transform`` and
        saved when nothing is cached.

        Subject-wise mode: every item is looked up on its own, keyed by the
        matching item of ``initial_X``. All items without a cache entry are
        transformed together and then saved one by one; cached items are loaded
        one by one. Unless only the cache has to be filled, the collected items
        are brought back into input order before they are returned.

        Every save and load appends a ``(name, duration in seconds, n)`` tuple to
        ``self.time_monitor['transform_cached']``.

        :param name: name of the pipeline step, used as cache key
        :param X: input data of the step
        :param y: targets belonging to X
        :param kwargs: dict of additional arguments belonging to X
        :param transformer: element handed to ``_do_timed_fit_transform``
        :param fit: forwarded to ``_do_timed_fit_transform``
        :param needed_for_further_computation: if True the result is collected
            and returned even when ``self.skip_loading`` is set
        :param initial_X: the untransformed input whose items identify the
            single-subject cache entries; read in subject-wise mode only
        :returns: tuple ``(X, y, kwargs)``; in subject-wise mode with
            ``skip_loading`` set and no further computation needed this is
            ``([], [], {})``
        """
        if not self.single_subject_caching:
            # if we do it group-wise then its easy
            if self.skip_loading and not needed_for_further_computation:
                # check if data is already calculated
                if self.cache_man.check_cache(name):
                    # if so, do nothing
                    return X, y, kwargs
                else:
                    # otherwise, do the calculation and save it
                    cached_result = None
            else:
                start_time_for_loading = datetime.datetime.now()
                cached_result = self.cache_man.load_cached_data(name)

            if cached_result is None:
                X, y, kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X, y, **kwargs)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X, y, kwargs))
                saving_duration = (datetime.datetime.now() -
                                   start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append(
                    (name, saving_duration, 1))
            else:
                X, y, kwargs = cached_result[0], cached_result[
                    1], cached_result[2]
                loading_duration = (datetime.datetime.now() -
                                    start_time_for_loading).total_seconds()
                n = PhotonDataHelper.find_n(X)
                self.time_monitor['transform_cached'].append(
                    (name, loading_duration, n))
            return X, y, kwargs
        else:
            # if we do it subject-wise we need to iterate and collect the results
            processed_X, processed_y, processed_kwargs = list(), list(), dict()
            X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(
            ), list(), dict(), list()
            list_of_idx_cached, list_of_idx_non_cached = list(), list()

            nr = PhotonDataHelper.find_n(X)
            for start, stop in PhotonDataHelper.chunker(nr, 1):
                # split data in single entities, find key from first element = PATH to file
                X_key, _, _ = PhotonDataHelper.split_data(
                    initial_X, None, {}, start, stop)
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    X, y, kwargs, start, stop)
                self.cache_man.update_single_subject_state_info(X_key)

                # check if item has been processed
                if self.cache_man.check_cache(name):
                    list_of_idx_cached.append(start)
                else:
                    list_of_idx_non_cached.append(start)
                    X_uncached = PhotonDataHelper.stack_data_vertically(
                        X_uncached, X_batched)
                    y_uncached = PhotonDataHelper.stack_data_vertically(
                        y_uncached, y_batched)
                    initial_X_uncached = PhotonDataHelper.stack_data_vertically(
                        initial_X_uncached, X_key)
                    kwargs_uncached = PhotonDataHelper.join_dictionaries(
                        kwargs_uncached, kwargs_dict_batched)

            # loading/collecting is skipped when we only fill the cache (case: parallelisation)
            collect_results = not self.skip_loading or needed_for_further_computation

            # now we know which part can be loaded and which part should be transformed
            # first apply the transformation to the group, then save it single-subject-wise
            if list_of_idx_non_cached:

                # apply transformation groupwise
                new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X_uncached, y_uncached,
                    **kwargs_uncached)

                # then save it single
                nr = PhotonDataHelper.find_n(new_group_X)
                for start in range(nr):
                    # split data in single entities
                    X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                        new_group_X, new_group_y, new_group_kwargs, start,
                        start)
                    X_key, _, _ = PhotonDataHelper.split_data(
                        initial_X_uncached, None, {}, start, start)
                    # we save the data in relation to the input path (X_key = hash(input X))
                    self.cache_man.update_single_subject_state_info(X_key)

                    start_time_saving = datetime.datetime.now()
                    self.cache_man.save_data_to_cache(
                        name, (X_batched, y_batched, kwargs_dict_batched))
                    saving_duration = (datetime.datetime.now() -
                                       start_time_saving).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, saving_duration, 1))

                if collect_results:
                    # the freshly computed items form the head of the result
                    processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

            # afterwards load everything that has been cached
            if list_of_idx_cached and collect_results:
                for cache_idx in list_of_idx_cached:
                    # we identify the data according to the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info(
                        [initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(
                        name)
                    loading_duration = (
                        datetime.datetime.now() -
                        start_time_for_loading).total_seconds()
                    # NOTE(review): this logs the size of the complete input for a
                    # single-item load, whereas the save above logs 1 - confirm
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration,
                         PhotonDataHelper.find_n(X)))

                    # cached items are appended behind the computed ones
                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X, processed_y,
                        transformed_y, processed_kwargs,
                        transformed_kwargs)

            logger.debug(name + " loaded " + str(len(list_of_idx_cached)) +
                         " items from cache and computed " +
                         str(len(list_of_idx_non_cached)))
            if collect_results:
                # now sort the data in the correct order again.
                # The index list has to follow the order in which the data was
                # assembled above: computed (non-cached) items first, cached
                # items second. Stacking it the other way round scrambles the
                # result whenever only a part of the items was cached.
                processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                    processed_X, processed_y, processed_kwargs,
                    PhotonDataHelper.stack_data_vertically(
                        list_of_idx_non_cached, list_of_idx_cached))

            return processed_X, processed_y, processed_kwargs
Exemple #9
0
    def fit(self, X, y, **kwargs):
        """
        Iterates over cross-validation folds and trains the pipeline, then uses it for predictions.
        Calculates metrics per fold and averages them over fold.

        The inner folds of ``self.outer_fold_id`` are processed one after another.
        After every fold the optimization constraints (a single object or a list
        of them) are asked whether cross validation shall continue; if not, the
        remaining folds are skipped. Unless ``self.raise_error`` is set, any
        exception is logged and stored in the returned config item instead of
        being propagated.

        :param X: Training and test data
        :param y: Training and test targets
        :param kwargs: additional arguments that are split along with X and y
        :returns: configuration class for result tree that monitors training and test performance
        :raises Exception: whatever the inner cross validation raises, but only
            if ``self.raise_error`` is set
        """

        # needed for testing Timeboxed Random Grid Search
        # time.sleep(35)

        config_item = MDBConfig()
        config_item.config_dict = self.params
        config_item.inner_folds = []
        config_item.metrics_test = []
        config_item.metrics_train = []
        config_item.computation_start_time = datetime.datetime.now()

        try:
            # do inner cv
            inner_folds = self.cross_validation_infos.inner_folds[
                self.outer_fold_id]
            for fold_nr, (inner_fold_id, inner_fold) in enumerate(
                    inner_folds.items(), start=1):

                train, test = inner_fold.train_indices, inner_fold.test_indices

                # split kwargs according to cross validation
                train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=train)
                test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=test)

                new_pipe = self.pipe()
                if self.cache_folder is not None and self.cache_updater is not None:
                    self.cache_updater(new_pipe, self.cache_folder,
                                       inner_fold_id)

                # the readable config is built (and printed) once, in the first fold
                if not config_item.human_readable_config:
                    config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                        new_pipe, self.params)
                    logger.clean_info(
                        json.dumps(config_item.human_readable_config,
                                   indent=4,
                                   sort_keys=True))

                job_data = InnerFoldManager.InnerCVJob(
                    pipe=new_pipe,
                    config=dict(self.params),
                    metrics=self.optimization_infos.metrics,
                    callbacks=self.optimization_constraints,
                    train_data=InnerFoldManager.JobData(
                        train_X, train_y, train, kwargs_cv_train),
                    test_data=InnerFoldManager.JobData(test_X, test_y, test,
                                                       kwargs_cv_test),
                )

                # only for unparallel processing
                # inform children in which inner fold we are
                # self.pipe.distribute_cv_info_to_hyperpipe_children(inner_fold_counter=fold_cnt)
                # self.mother_inner_fold_handle(fold_cnt)

                # --> write that output in InnerFoldManager!
                # logger.debug(config_item.human_readable_config)
                logger.debug("calculating inner fold " + str(fold_nr) + "...")

                curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(
                    job_data)
                logger.debug("Performance inner fold " + str(fold_nr))
                print_double_metrics(
                    curr_train_fold.metrics,
                    curr_test_fold.metrics,
                    photon_system_log=False,
                )

                durations = job_data.pipe.time_monitor

                self.update_config_item_with_inner_fold(
                    config_item=config_item,
                    fold_cnt=fold_nr,
                    curr_train_fold=curr_train_fold,
                    curr_test_fold=curr_test_fold,
                    time_monitor=durations,
                    feature_importances=new_pipe.feature_importances_,
                )

                # ask the constraints whether the remaining folds are still worth computing
                if isinstance(self.optimization_constraints, list):
                    stop_cv = False
                    for cf in self.optimization_constraints:
                        if not cf.shall_continue(config_item):
                            logger.info(
                                "Skipped further cross validation after fold "
                                + str(fold_nr) +
                                " due to performance constraints in " +
                                cf.metric)
                            stop_cv = True
                            break
                    if stop_cv:
                        break
                elif self.optimization_constraints is not None:
                    if not self.optimization_constraints.shall_continue(
                            config_item):
                        # the loop variable of the list branch does not exist here,
                        # so the metric has to be read from the constraint itself
                        logger.info(
                            "Skipped further cross validation after fold " +
                            str(fold_nr) +
                            " due to performance constraints in " +
                            self.optimization_constraints.metric)
                        break

            InnerFoldManager.process_fit_results(
                config_item,
                self.cross_validation_infos.calculate_metrics_across_folds,
                self.cross_validation_infos.calculate_metrics_per_fold,
                self.optimization_infos.metrics,
            )

        except Exception as e:
            if self.raise_error:
                raise
            logger.error(e)
            logger.error(traceback.format_exc())
            traceback.print_exc()
            # warnings are recorded but do not mark the configuration as failed
            if not isinstance(e, Warning):
                config_item.config_failed = True
            config_item.config_error = str(e)
            warnings.warn("One test iteration of pipeline failed with error")

        logger.debug("...done with")
        logger.debug(
            json.dumps(config_item.human_readable_config,
                       indent=4,
                       sort_keys=True))

        config_item.computation_end_time = datetime.datetime.now()
        return config_item