Example No. 1
    def setUp(self):
        super(CachedPhotonPipelineTests, self).setUp()
        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]}, random_state=3)
        svm = PipelineElement("SVC", {'kernel': ['rbf', 'linear']}, random_state=3)

        self.pipe = PhotonPipeline([('StandardScaler', ss),
                                    ('PCA', pca),
                                    ('SVC', svm)])

        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"
        CacheManager.clear_cache_files(self.cache_folder_path)
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {'PCA__n_components': 4,
                        'SVC__C': 3,
                        'SVC__kernel': 'rbf'}

        self.config2 = {'PCA__n_components': 7,
                        'SVC__C': 1,
                        'SVC__kernel': 'linear'}

        self.X, self.y = load_breast_cancer(return_X_y=True)
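
Note: a minimal sketch of how this fixture is typically exercised, assuming the photonai objects used in the snippet (PhotonPipeline, PipelineElement); the helper name and the "*.p" glob pattern are illustrative and mirror the other tests in this section. The idea is to fit the cached pipeline once per config and count the pickle files that appear in the cache folder.

import glob
import os

def fit_and_count_cache_files(pipe, config, X, y, cache_folder):
    # route the hyperparameters to the PCA / SVC elements, then fit;
    # with pipe.caching = True every transformer output is pickled per config
    pipe.set_params(**config)
    pipe.fit(X, y)
    return len(glob.glob(os.path.join(cache_folder, "*.p")))

Called first with self.config1 and then with self.config2, one would expect the count to grow whenever an element sees a new relevant hyperparameter configuration or new input data; the final SVC is fitted but not cached.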
Example No. 2
    def setUp(self):
        super(CacheManagerTests, self).setUp()

        self.cache_man = CacheManager("123353423434", self.cache_folder_path)
        self.X, self.y, self.kwargs = np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5]), {'covariates': [9, 8, 7, 6, 5]}

        self.config1 = {'PCA__n_components': 5,
                        'SVC__C': 3,
                        'SVC__kernel': 'rbf'}
        self.item_names = ["StandardScaler", "PCA", "SVC"]

        self.config2 = {'PCA__n_components': 20,
                        'SVC__C': 1,
                        'SVC__kernel': 'linear'}
Example No. 3
    def fold_id(self, value):
        if value is None:
            self._fold_id = ''
            # we don't need group-wise caching if we have no inner fold id
            self.caching = False
            self.cache_man = None
        else:
            if self._fix_fold_id:
                self._fold_id = "fixed_fold_id"
            else:
                self._fold_id = str(value)
            self.caching = True
            self.cache_man = CacheManager(self._fold_id, self.cache_folder,
                                          self._parallel_use,
                                          self._single_subject_caching)
Example No. 4
    def cache_folder(self, value):

        if not self._do_not_delete_cache_folder:
            self._cache_folder = value
        else:
            if isinstance(value, str) and not value.endswith("DND"):
                self._cache_folder = value + "DND"
            else:
                self._cache_folder = value

        if isinstance(self._cache_folder, str):
            self.caching = True
            if not os.path.isdir(self._cache_folder):
                os.makedirs(self._cache_folder)
            self.cache_man = CacheManager(self._fold_id, self.cache_folder,
                                          self._parallel_use,
                                          self._single_subject_caching)
        else:
            self.caching = False
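
Note: taken together, the fold_id and cache_folder setters above decide whether a CacheManager exists at all. A minimal sketch of their observable behaviour follows; the import paths and the temporary folders are assumptions, not taken from the snippets above.

import tempfile

from photonai.base import PipelineElement
from photonai.base.photon_pipeline import PhotonPipeline  # assumed import path

pipe = PhotonPipeline([("StandardScaler", PipelineElement("StandardScaler", {}))])

# assigning a string cache folder enables caching and creates the directory
pipe.cache_folder = tempfile.mkdtemp()
assert pipe.caching

# no inner fold id -> group-wise caching is switched off again
pipe.fold_id = None
assert not pipe.caching and pipe.cache_man is None

# any other value is stored as a string and re-enables caching
pipe.fold_id = 42
assert pipe.caching and pipe.fold_id == "42"

# with single-subject caching, the folder name gets a "DND" (do not delete) suffix
pipe.single_subject_caching = True
pipe.cache_folder = tempfile.mkdtemp()
assert pipe.cache_folder.endswith("DND")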
Example No. 5
    def test_single_subject_caching(self):

        nb = NeuroBranch("subject_caching_test")
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)

        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        y = np.random.randn(len(X))

        cache_folder = self.cache_folder_path
        cache_folder = os.path.join(cache_folder, "subject_caching_test")
        nb.base_element.cache_folder = cache_folder

        nr_of_expected_pickles_per_config = len(X)

        def transform_and_check_folder(config, expected_nr_of_files):
            nb.set_params(**config)
            nb.transform(X, y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files == expected_nr_of_files)

        # fit with first config
        # expect one cache file per input file
        transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                   nr_of_expected_pickles_per_config)

        # after fitting with the second config, we expect twice the number of input files in the cache
        transform_and_check_folder({"ResampleImages__voxel_size": 10},
                                   2 * nr_of_expected_pickles_per_config)

        # fit with the first config again; we expect no new cache files, because they already exist
        transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                   2 * nr_of_expected_pickles_per_config)

        # clean up afterwards
        CacheManager.clear_cache_files(cache_folder)
Example No. 6
    def setUp(self):
        super(CacheManagerTests, self).setUp()

        self.cache_man = CacheManager("123353423434", self.cache_folder_path)
        self.X, self.y, self.kwargs = (
            np.array([1, 2, 3, 4, 5]),
            np.array([1, 2, 3, 4, 5]),
            {
                "covariates": [9, 8, 7, 6, 5]
            },
        )

        self.config1 = {
            "PCA__n_components": 5,
            "SVC__C": 3,
            "SVC__kernel": "rbf"
        }
        self.item_names = ["StandardScaler", "PCA", "SVC"]

        self.config2 = {
            "PCA__n_components": 20,
            "SVC__C": 1,
            "SVC__kernel": "linear"
        }
Example No. 7
    def test_single_subject_caching(self):

        nb = ParallelBranch("subject_caching_test")
        # increase complexity by adding batching
        nb += PipelineElement.create("ResampleImages",
                                     StupidAdditionTransformer(), {},
                                     batch_size=4)

        cache_folder = self.cache_folder_path
        cache_folder = os.path.join(cache_folder, 'subject_caching_test')
        nb.base_element.cache_folder = cache_folder

        def transform_and_check_folder(config, expected_nr_of_files):
            nb.set_params(**config)
            nb.transform(self.X, self.y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files == expected_nr_of_files)

        # fit with first config
        # expect one cache file per input file
        transform_and_check_folder({'ResampleImages__voxel_size': 5},
                                   self.nr_of_expected_pickles_per_config)

        # after fitting with the second config, we expect twice the number of input items in the cache
        transform_and_check_folder({'ResampleImages__voxel_size': 10},
                                   2 * self.nr_of_expected_pickles_per_config)

        # fit with the first config again; we expect no new cache files, because they already exist
        transform_and_check_folder({'ResampleImages__voxel_size': 5},
                                   2 * self.nr_of_expected_pickles_per_config)

        # clean up afterwards
        CacheManager.clear_cache_files(cache_folder)
        CacheManager.clear_cache_files(self.tmp_folder_path, force_all=True)
Example No. 8
class PhotonPipeline(_BaseComposition):
    def __init__(self, elements, random_state=False):
        self.elements = elements
        self.random_state = random_state
        self.current_config = None
        # caching stuff
        self.caching = False
        self._fold_id = None
        self._cache_folder = None
        self.time_monitor = {
            'fit': [],
            'transform_computed': [],
            'transform_cached': [],
            'predict': []
        }
        self.cache_man = None

        # helper for single subject caching
        self._single_subject_caching = False
        self._fix_fold_id = False
        self._do_not_delete_cache_folder = False
        self._parallel_use = False

        # helper for optimum pipe
        self._meta_information = None

        # used in parallelization
        self.skip_loading = False

    def set_lock(self, lock):
        self.cache_man.lock = lock

    @property
    def single_subject_caching(self):
        return self._single_subject_caching

    @single_subject_caching.setter
    def single_subject_caching(self, value: bool):
        if value:
            self._fix_fold_id = True
            self._do_not_delete_cache_folder = True
        else:
            self._fix_fold_id = False
            self._do_not_delete_cache_folder = False
        self._single_subject_caching = value

    @property
    def fold_id(self):
        return self._fold_id

    @fold_id.setter
    def fold_id(self, value):
        if value is None:
            self._fold_id = ''
            # we don't need group-wise caching if we have no inner fold id
            self.caching = False
            self.cache_man = None
        else:
            if self._fix_fold_id:
                self._fold_id = "fixed_fold_id"
            else:
                self._fold_id = str(value)
            self.caching = True
            self.cache_man = CacheManager(self._fold_id, self.cache_folder,
                                          self._parallel_use,
                                          self._single_subject_caching)

    @property
    def cache_folder(self):
        return self._cache_folder

    @cache_folder.setter
    def cache_folder(self, value):

        if not self._do_not_delete_cache_folder:
            self._cache_folder = value
        else:
            if isinstance(value, str) and not value.endswith("DND"):
                self._cache_folder = value + "DND"
            else:
                self._cache_folder = value

        if isinstance(self._cache_folder, str):
            self.caching = True
            if not os.path.isdir(self._cache_folder):
                os.makedirs(self._cache_folder)
            self.cache_man = CacheManager(self._fold_id, self.cache_folder,
                                          self._parallel_use,
                                          self._single_subject_caching)
        else:
            self.caching = False

    def get_params(self, deep=True):
        """Get parameters for this estimator.
        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.
        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params('elements', deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.
        Valid parameter keys can be listed with ``get_params()``.
        Returns
        -------
        self
        """
        if self.current_config is not None and len(self.current_config) > 0:
            if kwargs is not None and len(kwargs) == 0:
                raise ValueError(
                    "Pipeline cannot set parameters to elements with an emtpy dictionary. Old values persist"
                )
        self.current_config = kwargs
        self._set_params('elements', **kwargs)

        return self

    def _validate_elements(self):
        names, estimators = zip(*self.elements)

        # validate names
        self._validate_names(names)

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None:
                continue
            if not (hasattr(t, "fit") and hasattr(t, "transform")):
                raise TypeError("All intermediate elements should be "
                                "transformers and implement fit and transform."
                                " '%s' (type %s) doesn't" % (t, type(t)))

        # We allow last estimator to be None as an identity transformation
        if estimator is not None and not hasattr(estimator, "fit"):
            raise TypeError("Last step of Pipeline should implement fit. "
                            "'%s' (type %s) doesn't" %
                            (estimator, type(estimator)))

    def fit(self, X, y=None, **kwargs):

        self._validate_elements()
        X, y, kwargs = self._caching_fit_transform(X, y, kwargs, fit=True)

        if self._final_estimator is not None:
            logger.debug('PhotonPipeline: Fitting ' +
                         self._final_estimator.name)
            fit_start_time = datetime.datetime.now()
            if self.random_state:
                self._final_estimator.random_state = self.random_state
            self._final_estimator.fit(X, y, **kwargs)
            n = PhotonDataHelper.find_n(X)
            fit_duration = (datetime.datetime.now() -
                            fit_start_time).total_seconds()
            self.time_monitor['fit'].append(
                (self.elements[-1][0], fit_duration, n))
        return self

    def check_for_numpy_array(self, list_object):
        # be compatible with lists of (image) files
        if isinstance(list_object, list):
            return np.asarray(list_object)
        else:
            return list_object

    def transform(self, X, y=None, **kwargs):
        """
        Calls transform on every step that offers a transform function
        including the last step if it has the transformer flag,
        and excluding the last step if it has the estimator flag but no transformer flag.

        Returns transformed X, y and kwargs
        """
        if self.single_subject_caching:
            initial_X = np.array(X)
        else:
            initial_X = None

        X, y, kwargs = self._caching_fit_transform(X, y, kwargs)

        if self._final_estimator is not None:
            if self._estimator_type is None:
                if self.caching:
                    X, y, kwargs = self.load_or_save_cached_data(
                        self._final_estimator.name,
                        X,
                        y,
                        kwargs,
                        self._final_estimator,
                        initial_X=initial_X)
                else:
                    logger.debug('PhotonPipeline: Transforming data with ' +
                                 self._final_estimator.name)
                    X, y, kwargs = self._final_estimator.transform(
                        X, y, **kwargs)

        return X, y, kwargs

    def load_or_save_cached_data(self,
                                 name,
                                 X,
                                 y,
                                 kwargs,
                                 transformer,
                                 fit=False,
                                 needed_for_further_computation=False,
                                 initial_X=None):
        if not self.single_subject_caching:
            # if we do it group-wise then it's easy
            if self.skip_loading and not needed_for_further_computation:
                # check if data is already calculated
                if self.cache_man.check_cache(name):
                    # if so, do nothing
                    return X, y, kwargs
                else:
                    # otherwise, do the calculation and save it
                    cached_result = None
            else:
                start_time_for_loading = datetime.datetime.now()
                cached_result = self.cache_man.load_cached_data(name)

            if cached_result is None:
                X, y, kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X, y, **kwargs)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X, y, kwargs))
                saving_duration = (datetime.datetime.now() -
                                   start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append(
                    (name, saving_duration, 1))
            else:
                X, y, kwargs = cached_result[0], cached_result[
                    1], cached_result[2]
                loading_duration = (datetime.datetime.now() -
                                    start_time_for_loading).total_seconds()
                n = PhotonDataHelper.find_n(X)
                self.time_monitor['transform_cached'].append(
                    (name, loading_duration, n))
            return X, y, kwargs
        else:
            # if we do it subject-wise we need to iterate and collect the results
            processed_X, processed_y, processed_kwargs = list(), list(), dict()
            X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(
            ), list(), dict(), list()
            list_of_idx_cached, list_of_idx_non_cached = list(), list()

            nr = PhotonDataHelper.find_n(X)
            for start, stop in PhotonDataHelper.chunker(nr, 1):
                # split data in single entities, find key from first element = PATH to file
                X_key, _, _ = PhotonDataHelper.split_data(
                    initial_X, None, {}, start, stop)
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    X, y, kwargs, start, stop)
                self.cache_man.update_single_subject_state_info(X_key)

                # check if item has been processed
                if self.cache_man.check_cache(name):
                    list_of_idx_cached.append(start)
                else:
                    list_of_idx_non_cached.append(start)
                    X_uncached = PhotonDataHelper.stack_data_vertically(
                        X_uncached, X_batched)
                    y_uncached = PhotonDataHelper.stack_data_vertically(
                        y_uncached, y_batched)
                    initial_X_uncached = PhotonDataHelper.stack_data_vertically(
                        initial_X_uncached, X_key)
                    kwargs_uncached = PhotonDataHelper.join_dictionaries(
                        kwargs_uncached, kwargs_dict_batched)

            # now we know which part can be loaded and which part should be transformed
            # first apply the transformation to the group, then save it single-subject-wise
            if len(list_of_idx_non_cached) > 0:

                # apply transformation groupwise
                new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X_uncached, y_uncached,
                    **kwargs_uncached)

                # then save the results one subject at a time
                nr = PhotonDataHelper.find_n(new_group_X)
                for start in range(nr):
                    # split data in single entities
                    X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                        new_group_X, new_group_y, new_group_kwargs, start,
                        start)
                    X_key, _, _ = PhotonDataHelper.split_data(
                        initial_X_uncached, None, {}, start, start)
                    # we save the data in relation to the input path (X_key = hash(input X))
                    self.cache_man.update_single_subject_state_info(X_key)

                    start_time_saving = datetime.datetime.now()
                    self.cache_man.save_data_to_cache(
                        name, (X_batched, y_batched, kwargs_dict_batched))
                    saving_duration = (datetime.datetime.now() -
                                       start_time_saving).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, saving_duration, 1))

                # we need to collect the data only when we want to load them
                # we can skip that process if we only want them to get into the cache (case: parallelisation)
                if not self.skip_loading or needed_for_further_computation:
                    # stack results
                    processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

            # afterwards load everything that has been cached
            if len(list_of_idx_cached) > 0:
                if not self.skip_loading or needed_for_further_computation:
                    for cache_idx in list_of_idx_cached:
                        # we identify the data according to the input path (X before any transformation)
                        self.cache_man.update_single_subject_state_info(
                            [initial_X[cache_idx]])

                        # time the loading of the cached item
                        start_time_for_loading = datetime.datetime.now()
                        transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(
                            name)
                        loading_duration = (
                            datetime.datetime.now() -
                            start_time_for_loading).total_seconds()
                        self.time_monitor['transform_cached'].append(
                            (name, loading_duration,
                             PhotonDataHelper.find_n(X)))

                        processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                            processed_X, transformed_X, processed_y,
                            transformed_y, processed_kwargs,
                            transformed_kwargs)

            logger.debug(name + " loaded " + str(len(list_of_idx_cached)) +
                         " items from cache and computed " +
                         str(len(list_of_idx_non_cached)))
            if not self.skip_loading or needed_for_further_computation:
                # now sort the data in the correct order again
                processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                    processed_X, processed_y, processed_kwargs,
                    PhotonDataHelper.stack_data_vertically(
                        list_of_idx_cached, list_of_idx_non_cached))

            return processed_X, processed_y, processed_kwargs

    def _do_timed_fit_transform(self, name, transformer, fit, X, y, **kwargs):

        n = PhotonDataHelper.find_n(X)
        if self.random_state:
            transformer.random_state = self.random_state

        if fit:
            logger.debug('PhotonPipeline: Fitting ' + transformer.name)
            fit_start_time = datetime.datetime.now()
            transformer.fit(X, y, **kwargs)
            fit_duration = (datetime.datetime.now() -
                            fit_start_time).total_seconds()
            self.time_monitor['fit'].append((name, fit_duration, n))

        logger.debug('PhotonPipeline: Transforming data with ' +
                     transformer.name)
        transform_start_time = datetime.datetime.now()
        X, y, kwargs = transformer.transform(X, y, **kwargs)
        transform_duration = (datetime.datetime.now() -
                              transform_start_time).total_seconds()
        self.time_monitor['transform_computed'].append(
            (name, transform_duration, n))
        return X, y, kwargs

    def _caching_fit_transform(self, X, y, kwargs, fit=False):

        if self.single_subject_caching:
            initial_X = np.array(X)
        else:
            initial_X = None

        if self.caching:
            # update info, just in case
            self.cache_man.hash = self._fold_id
            self.cache_man.cache_folder = self.cache_folder
            if not self.single_subject_caching:
                self.cache_man.prepare([name for name, e in self.elements],
                                       self.current_config, X)
            else:
                self.cache_man.prepare([name for name, e in self.elements],
                                       self.current_config,
                                       single_subject_caching=True)
            last_cached_item = None

        # all elements except the last one
        num_steps = len(self.elements) - 1

        for num, (name, transformer) in enumerate(self.elements[:-1]):
            if not self.caching or self.current_config is None or \
                    (hasattr(transformer, 'skip_caching') and transformer.skip_caching):
                X, y, kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X, y, **kwargs)
            else:
                # load data when the first item occurs that needs new calculation
                if self.cache_man.check_cache(name):
                    # as long as we find something cached, we remember what it was
                    last_cached_item = name
                    # if it is the last step, we need to load the data now
                    if num + 1 == num_steps and not self.skip_loading:
                        X, y, kwargs = self.load_or_save_cached_data(
                            last_cached_item,
                            X,
                            y,
                            kwargs,
                            transformer,
                            fit,
                            initial_X=initial_X)
                else:
                    if last_cached_item is not None:
                        # we load the cached data when the first transformation on this data is upcoming
                        X, y, kwargs = self.load_or_save_cached_data(
                            last_cached_item,
                            X,
                            y,
                            kwargs,
                            transformer,
                            fit,
                            needed_for_further_computation=True,
                            initial_X=initial_X)
                    X, y, kwargs = self.load_or_save_cached_data(
                        name,
                        X,
                        y,
                        kwargs,
                        transformer,
                        fit,
                        initial_X=initial_X)

            # always work with numpy arrays to avoid checking for shape attribute
            X = self.check_for_numpy_array(X)
            y = self.check_for_numpy_array(y)

        return X, y, kwargs

    def predict(self, X, training=False, **kwargs):
        """
        Transforms the data for every step that offers a transform function
        and then calls the estimator with predict on transformed data.
        It returns the predictions made.

        If the last step is not an estimator, the transformed data is returned.
        """

        # first transform
        if not training:
            X, _, kwargs = self.transform(X, y=None, **kwargs)

        # then call predict on final estimator
        if self._final_estimator is not None:
            if self._final_estimator.is_estimator:
                logger.debug('PhotonPipeline: Predicting with ' +
                             self._final_estimator.name + ' ...')
                predict_start_time = datetime.datetime.now()
                y_pred = self._final_estimator.predict(X, **kwargs)
                predict_duration = (datetime.datetime.now() -
                                    predict_start_time).total_seconds()
                n = PhotonDataHelper.find_n(X)
                self.time_monitor['predict'].append(
                    (self.elements[-1][0], predict_duration, n))
                return y_pred
            else:
                return X
        else:
            return None

    def predict_proba(self, X, training: bool = False, **kwargs):
        if not training:
            X, _, kwargs = self.transform(X, y=None, **kwargs)

        if self._final_estimator is not None:
            if self._final_estimator.is_estimator:
                if hasattr(self._final_estimator, "predict_proba"):
                    if hasattr(self._final_estimator, 'needs_covariates'):
                        if self._final_estimator.needs_covariates:
                            return self._final_estimator.predict_proba(
                                X, **kwargs)
                        else:
                            return self._final_estimator.predict_proba(X)
                    else:
                        return self._final_estimator.predict_proba(X)

        raise NotImplementedError(
            "The final estimator does not have a predict_proba method")

    def inverse_transform(self, X, y=None, **kwargs):
        # simply use X to apply inverse_transform
        # does not work on any transformers changing y or kwargs!
        for name, transform in self.elements[::-1]:
            try:
                X, y, kwargs = transform.inverse_transform(X, y, **kwargs)
            except Exception as e:
                if isinstance(e, NotImplementedError):
                    return X, y, kwargs

        return X, y, kwargs

    def fit_transform(self, X, y=None, **kwargs):
        # return self.fit(X, y, **kwargs).transform(X, y, **kwargs)
        raise NotImplementedError(
            'fit_transform not yet implemented in PHOTON Pipeline')

    def fit_predict(self, X, y=None, **kwargs):
        raise NotImplementedError(
            'fit_predict not yet implemented in PHOTON Pipeline')

    def copy_me(self):
        pipeline_steps = []
        for item_name, item in self.elements:
            cpy = item.copy_me()
            if isinstance(cpy, list):
                for new_step in cpy:
                    pipeline_steps.append((new_step.name, new_step))
            else:
                pipeline_steps.append((cpy.name, cpy))
        new_pipe = PhotonPipeline(pipeline_steps)
        new_pipe.random_state = self.random_state
        return new_pipe

    @property
    def named_steps(self):
        return dict(self.elements)

    @property
    def _final_estimator(self):
        return self.elements[-1][1]

    @property
    def _estimator_type(self):
        return getattr(self._final_estimator, '_estimator_type')

    def clear_cache(self):
        if self.cache_man is not None:
            self.cache_man.clear_cache()

    def _add_preprocessing(self, preprocessing):
        if preprocessing:
            self.elements.insert(0, (preprocessing.name, preprocessing))

    @property
    def feature_importances_(self):
        return self.elements[-1][1].feature_importances_
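
Note: besides caching, the pipeline records per-element run times in time_monitor. A small sketch of inspecting those timings after fitting; the import paths are assumed as in the earlier notes, and the dataset and summary loop are purely illustrative.

from sklearn.datasets import load_breast_cancer

from photonai.base import PipelineElement
from photonai.base.photon_pipeline import PhotonPipeline  # assumed import path

X, y = load_breast_cancer(return_X_y=True)
pipe = PhotonPipeline([("StandardScaler", PipelineElement("StandardScaler", {})),
                       ("PCA", PipelineElement("PCA", {})),
                       ("SVC", PipelineElement("SVC", {}))])
pipe.fit(X, y)

# every entry is a (element_name, duration_in_seconds, n_samples) tuple
for key in ("fit", "transform_computed"):
    total = sum(duration for _, duration, _ in pipe.time_monitor[key])
    print(f"{key}: {total:.3f}s over {len(pipe.time_monitor[key])} calls")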
Example No. 9
    def test_combi_from_single_and_group_caching(self):

        # 1. load data
        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        nr_of_expected_pickles_per_config = len(X)
        y = np.random.randn(len(X))

        # 2. specify cache directories
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          "subject_caching_test")

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        # 3. set up Neuro Branch
        nb = NeuroBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)
        nb += PipelineElement("BrainMask", batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]})
        svm = PipelineElement("SVR", {"kernel": ["rbf", "linear"]})

        pipe = PhotonPipeline([("NeuroBranch", nb), ("StandardScaler", ss),
                               ("PCA", pca), ("SVR", svm)])

        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            pipe.set_params(**config)
            pipe.fit(X, y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files == expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files_subject == expected_nr_subject)

        config1 = {
            "NeuroBranch__ResampleImages__voxel_size": 5,
            "PCA__n_components": 7,
            "SVR__C": 2,
        }
        config2 = {
            "NeuroBranch__ResampleImages__voxel_size": 3,
            "PCA__n_components": 4,
            "SVR__C": 5,
        }

        # for the first config we expect one cache file for the standard scaler and one for the PCA,
        # and two files (one from the resampler, one from the brain mask) per input image
        transform_and_check_folder(config1, 2,
                                   2 * nr_of_expected_pickles_per_config)

        # for the second config we expect two cache files for the standard scaler (one for the 5-voxel input and one
        # for the 3-voxel input) and two PCA files (one per config),
        # and per input image two resampler files plus two brain-mask files, i.e. 4 * the number of inputs
        transform_and_check_folder(config2, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # when we transform with the first config again, nothing should happen
        transform_and_check_folder(config1, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # when we transform with an empty config, a new entry for the pca and the standard scaler should be generated,
        # as well as a new cache item per input image for each element in the neuro branch
        with self.assertRaises(ValueError):
            transform_and_check_folder({}, 6,
                                       6 * nr_of_expected_pickles_per_config)

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)
Example No. 10
class CacheManagerTests(PhotonBaseTest):
    def setUp(self):
        super(CacheManagerTests, self).setUp()

        self.cache_man = CacheManager("123353423434", self.cache_folder_path)
        self.X, self.y, self.kwargs = (
            np.array([1, 2, 3, 4, 5]),
            np.array([1, 2, 3, 4, 5]),
            {
                "covariates": [9, 8, 7, 6, 5]
            },
        )

        self.config1 = {
            "PCA__n_components": 5,
            "SVC__C": 3,
            "SVC__kernel": "rbf"
        }
        self.item_names = ["StandardScaler", "PCA", "SVC"]

        self.config2 = {
            "PCA__n_components": 20,
            "SVC__C": 1,
            "SVC__kernel": "linear"
        }

    def test_find_relevant_configuration_items(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               X=self.X,
                               config=self.config1)
        relevant_items = {"PCA__n_components": 5}
        relevant_items_hash = hash(frozenset(relevant_items.items()))
        new_hash = self.cache_man._find_config_for_element("PCA")
        self.assertEqual(relevant_items_hash, new_hash)

    def test_empty_config(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               X=self.X,
                               config={})
        relevant_items_hash = hash(frozenset({}.items()))
        new_hash = self.cache_man._find_config_for_element("PCA")
        self.assertEqual(relevant_items_hash, new_hash)

    def test_initial_transformation(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               config=self.config1)
        result = self.cache_man.load_cached_data("PCA")
        self.assertEqual(result, None)

    def test_check_cache(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               config=self.config1)
        self.assertFalse(self.cache_man.check_cache("PCA"))
        self.cache_man.save_data_to_cache("PCA", (self.X, self.y, self.kwargs))
        self.assertTrue(self.cache_man.check_cache("PCA"))

    def test_key_hash_equal(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               config=self.config1)
        generator_1 = self.cache_man.generate_cache_key("PCA")
        generator_2 = self.cache_man.generate_cache_key("PCA")
        self.assertEqual(generator_1, generator_2)

    def test_saving_and_loading_transformation(self):
        self.cache_man.prepare(pipe_elements=self.item_names,
                               config=self.config1)
        self.cache_man.save_data_to_cache("PCA", (self.X, self.y, self.kwargs))

        self.assertTrue(len(self.cache_man.cache_index) == 1)
        for hash_key, cache_file in self.cache_man.cache_index.items():
            self.assertTrue(os.path.isfile(cache_file))

        result = self.cache_man.load_cached_data("PCA")
        self.assertTrue(result is not None)
        X_loaded, y_loaded, kwargs_loaded = result[0], result[1], result[2]
        self.assertTrue(np.array_equal(self.X, X_loaded))
        self.assertTrue(np.array_equal(self.y, y_loaded))
        self.assertTrue(
            np.array_equal(self.kwargs["covariates"],
                           kwargs_loaded["covariates"]))

    def test_clearing_folder(self):
        self.cache_man.clear_cache()
        self.assertTrue(
            len(glob.glob(os.path.join(self.cache_man.cache_folder, "*.p"))) ==
            0)
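
Note: the setUp defines two configs, but the tests above only ever prepare one at a time. What follows is a hedged sketch of an additional check one might add to the same class, based on the key-generation behaviour the group-wise caching tests rely on (that differing relevant hyperparameters lead to differing cache keys); it is a hypothetical test method, not part of the library's test suite.

    # hypothetical extra test method for CacheManagerTests
    def test_key_differs_between_configs(self):
        self.cache_man.prepare(pipe_elements=self.item_names, config=self.config1)
        key_config1 = self.cache_man.generate_cache_key("PCA")

        self.cache_man.prepare(pipe_elements=self.item_names, config=self.config2)
        key_config2 = self.cache_man.generate_cache_key("PCA")

        # config1 and config2 use different PCA__n_components, so the keys should differ
        self.assertNotEqual(key_config1, key_config2)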
Example No. 11
    def test_combi_from_single_and_group_caching(self):

        # 2. specify cache directories
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          'subject_caching_test')

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        # 3. set up Neuro Branch
        nb = ParallelBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement.create("ResampleImages",
                                     StupidAdditionTransformer(), {},
                                     batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]})
        svm = PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

        pipe = PhotonPipeline([('NeuroBranch', nb), ('StandardScaler', ss),
                               ('PCA', pca), ('SVR', svm)])

        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            pipe.set_params(**config)
            pipe.fit(self.X, self.y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files == expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files_subject == expected_nr_subject)

        config1 = {
            'NeuroBranch__ResampleImages__voxel_size': 5,
            'PCA__n_components': 7,
            'SVR__C': 2
        }
        config2 = {
            'NeuroBranch__ResampleImages__voxel_size': 3,
            'PCA__n_components': 4,
            'SVR__C': 5
        }

        # for the first config we expect one cache file for the standard scaler and one for the PCA,
        # and one resampler file per input sample
        transform_and_check_folder(config1, 2,
                                   self.nr_of_expected_pickles_per_config)

        # for the second config we expect two cache files for the standard scaler (one for the 5-voxel input and one
        # for the 3-voxel input) and two PCA files (one per config),
        # and two resampler files per input sample (one per voxel size)
        transform_and_check_folder(config2, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # when we transform with the first config again, nothing should happen
        transform_and_check_folder(config1, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # when we transform with an empty config, a new entry for the pca and the standard scaler should be generated,
        # as well as a new cache item per input sample for each element in the neuro branch
        with self.assertRaises(ValueError):
            transform_and_check_folder({}, 6, 4 *
                                       self.nr_of_expected_pickles_per_config)

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)