Code example #1
File: neuro_branch.py    Project: nkourkou/photon
    def apply_transform_parallelized(self, X):
        """

        :param X: the data to which the delegate should be applied in parallel
        """

        if self.nr_of_processes > 1:

            jobs_to_do = list()

            # distribute the data equally to all available cores
            number_of_items_to_process = PhotonDataHelper.find_n(X)
            number_of_items_for_each_core = int(
                np.ceil(number_of_items_to_process / self.nr_of_processes))
            logger.info("NeuroBranch " + self.name + ": Using " +
                        str(self.nr_of_processes) + " cores calculating " +
                        str(number_of_items_for_each_core) + " items each")
            for start, stop in PhotonDataHelper.chunker(
                    number_of_items_to_process, number_of_items_for_each_core):
                X_batched, _, _ = PhotonDataHelper.split_data(
                    X, None, {}, start, stop)

                # copy my pipeline
                new_pipe_mr = self.copy_me()
                new_pipe_copy = new_pipe_mr.base_element
                new_pipe_copy.cache_folder = self.base_element.cache_folder
                new_pipe_copy.skip_loading = True
                new_pipe_copy._parallel_use = True

                del_job = dask.delayed(NeuroBranch.parallel_application)(
                    new_pipe_copy, X_batched)
                jobs_to_do.append(del_job)

            dask.compute(*jobs_to_do)
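
The pattern above fans work out with dask.delayed: chunk the data, queue one delayed pipeline call per chunk, then trigger everything at once with dask.compute. The standalone sketch below reproduces only that pattern; process_chunk and parallel_apply are hypothetical stand-ins for NeuroBranch.parallel_application and the chunking helpers, not PHOTON code.

import math

import dask
import numpy as np


def process_chunk(chunk):
    # Hypothetical stand-in for the per-chunk work
    # (e.g. a copied pipeline's transform).
    return chunk * 2


def parallel_apply(X, nr_of_processes):
    # Distribute the items roughly equally across the requested processes.
    per_core = math.ceil(len(X) / nr_of_processes)
    jobs = [dask.delayed(process_chunk)(X[start:start + per_core])
            for start in range(0, len(X), per_core)]
    # dask.compute triggers all queued delayed jobs and returns their results.
    return dask.compute(*jobs)


results = parallel_apply(np.arange(12), nr_of_processes=3)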
Code example #2
    def predict(self, X, training=False, **kwargs):
        """
        Transforms the data with every step that offers a transform function,
        then calls predict on the final estimator with the transformed data
        and returns the predictions.

        If the last step is not an estimator, the transformed data is
        returned instead.
        """

        # first transform
        if not training:
            X, _, kwargs = self.transform(X, y=None, **kwargs)

        # then call predict on final estimator
        if self._final_estimator is not None:
            if self._final_estimator.is_estimator:
                logger.debug('PhotonPipeline: Predicting with ' +
                             self._final_estimator.name + ' ...')
                predict_start_time = datetime.datetime.now()
                y_pred = self._final_estimator.predict(X, **kwargs)
                predict_duration = (datetime.datetime.now() -
                                    predict_start_time).total_seconds()
                n = PhotonDataHelper.find_n(X)
                self.time_monitor['predict'].append(
                    (self.elements[-1][0], predict_duration, n))
                return y_pred
            else:
                return X
        else:
            return None
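
In miniature, predict() is "transform with everything before the last step, then time a predict call on the last step". The toy class below shows only that control flow plus the (name, duration, n) bookkeeping; the class and its attributes are illustrative, not PHOTON's API.

import datetime


class TinyPipeline:
    # Illustrative only: a pipeline reduced to the transform-then-predict flow.
    def __init__(self, transformers, estimator):
        self.transformers = transformers
        self.estimator = estimator
        self.time_monitor = {"predict": []}

    def predict(self, X):
        # First run every transformer, then time the final predict call.
        for t in self.transformers:
            X = t.transform(X)
        start = datetime.datetime.now()
        y_pred = self.estimator.predict(X)
        duration = (datetime.datetime.now() - start).total_seconds()
        self.time_monitor["predict"].append(("final_estimator", duration, len(X)))
        return y_pred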
Code example #3
    def fit(self, X, y=None, **kwargs):

        self._validate_elements()
        X, y, kwargs = self._caching_fit_transform(X, y, kwargs, fit=True)

        if self._final_estimator is not None:
            logger.debug("PhotonPipeline: Fitting " + self._final_estimator.name)
            fit_start_time = datetime.datetime.now()
            if self.random_state:
                self._final_estimator.random_state = self.random_state
            self._final_estimator.fit(X, y, **kwargs)
            # TODO: post-fit actions performed by the final estimator go here
            n = PhotonDataHelper.find_n(X)
            fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
            self.time_monitor["fit"].append((self.elements[-1][0], fit_duration, n))
        return self
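
One detail worth noting in fit() is the seed hand-off: if the pipeline itself carries a random_state, it is copied onto the final estimator immediately before fitting, so repeated runs stay reproducible. A minimal sketch of that hand-off, with purely illustrative names:

class SeededEstimator:
    # Illustrative estimator that only records the seed it was given.
    def __init__(self):
        self.random_state = None

    def fit(self, X, y):
        print("fitting with random_state =", self.random_state)
        return self


pipeline_random_state = 42
final_estimator = SeededEstimator()
if pipeline_random_state:
    # Same hand-off as in fit() above: the pipeline's seed wins.
    final_estimator.random_state = pipeline_random_state
final_estimator.fit([[0], [1]], [0, 1])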
Code example #4
    def _do_timed_fit_transform(self, name, transformer, fit, X, y, **kwargs):

        n = PhotonDataHelper.find_n(X)
        if self.random_state:
            transformer.random_state = self.random_state

        if fit:
            logger.debug("PhotonPipeline: Fitting " + transformer.name)
            fit_start_time = datetime.datetime.now()
            transformer.fit(X, y, **kwargs)
            fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
            self.time_monitor["fit"].append((name, fit_duration, n))

        logger.debug("PhotonPipeline: Transforming data with " + transformer.name)
        transform_start_time = datetime.datetime.now()
        X, y, kwargs = transformer.transform(X, y, **kwargs)
        transform_duration = (
            datetime.datetime.now() - transform_start_time
        ).total_seconds()
        self.time_monitor["transform_computed"].append((name, transform_duration, n))
        return X, y, kwargs
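
_do_timed_fit_transform repeats the same stopwatch bookkeeping for fit and transform: take a timestamp, run the call, append (name, duration in seconds, n items) to time_monitor. The helper below factors that pattern out; it is a possible refactor sketch under those assumptions, not PHOTON code.

import datetime


def timed_call(monitor, bucket, name, n, fn, *args, **kwargs):
    # Run fn, then record (name, duration in seconds, n items) in the monitor.
    start = datetime.datetime.now()
    result = fn(*args, **kwargs)
    duration = (datetime.datetime.now() - start).total_seconds()
    monitor[bucket].append((name, duration, n))
    return result


# Hypothetical usage inside _do_timed_fit_transform:
#   timed_call(self.time_monitor, "fit", name, n, transformer.fit, X, y, **kwargs)
#   X, y, kwargs = timed_call(self.time_monitor, "transform_computed", name, n,
#                             transformer.transform, X, y, **kwargs)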
Code example #5
    def load_or_save_cached_data(self,
                                 name,
                                 X,
                                 y,
                                 kwargs,
                                 transformer,
                                 fit=False,
                                 needed_for_further_computation=False,
                                 initial_X=None):
        if not self.single_subject_caching:
            # if we do it group-wise then it's easy
            if self.skip_loading and not needed_for_further_computation:
                # check if data is already calculated
                if self.cache_man.check_cache(name):
                    # if so, do nothing
                    return X, y, kwargs
                else:
                    # otherwise, do the calculation and save it
                    cached_result = None
            else:
                start_time_for_loading = datetime.datetime.now()
                cached_result = self.cache_man.load_cached_data(name)

            if cached_result is None:
                X, y, kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X, y, **kwargs)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X, y, kwargs))
                saving_duration = (datetime.datetime.now() -
                                   start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append(
                    (name, saving_duration, 1))
            else:
                X, y, kwargs = cached_result[0], cached_result[
                    1], cached_result[2]
                loading_duration = (datetime.datetime.now() -
                                    start_time_for_loading).total_seconds()
                n = PhotonDataHelper.find_n(X)
                self.time_monitor['transform_cached'].append(
                    (name, loading_duration, n))
            return X, y, kwargs
        else:
            # if we do it subject-wise we need to iterate and collect the results
            processed_X, processed_y, processed_kwargs = list(), list(), dict()
            X_uncached, y_uncached = list(), list()
            kwargs_uncached, initial_X_uncached = dict(), list()
            list_of_idx_cached, list_of_idx_non_cached = list(), list()

            nr = PhotonDataHelper.find_n(X)
            for start, stop in PhotonDataHelper.chunker(nr, 1):
                # split the data into single entities; the key from the first element is the PATH to the file
                X_key, _, _ = PhotonDataHelper.split_data(
                    initial_X, None, {}, start, stop)
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    X, y, kwargs, start, stop)
                self.cache_man.update_single_subject_state_info(X_key)

                # check if item has been processed
                if self.cache_man.check_cache(name):
                    list_of_idx_cached.append(start)
                else:
                    list_of_idx_non_cached.append(start)
                    X_uncached = PhotonDataHelper.stack_data_vertically(
                        X_uncached, X_batched)
                    y_uncached = PhotonDataHelper.stack_data_vertically(
                        y_uncached, y_batched)
                    initial_X_uncached = PhotonDataHelper.stack_data_vertically(
                        initial_X_uncached, X_key)
                    kwargs_uncached = PhotonDataHelper.join_dictionaries(
                        kwargs_uncached, kwargs_dict_batched)

            # now we know which part can be loaded and which part should be transformed
            # first apply the transformation to the group, then save it single-subject-wise
            if len(list_of_idx_non_cached) > 0:

                # apply transformation groupwise
                new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                    name, transformer, fit, X_uncached, y_uncached,
                    **kwargs_uncached)

                # then save it single
                nr = PhotonDataHelper.find_n(new_group_X)
                for start in range(nr):
                    # split the data into single entities
                    X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                        new_group_X, new_group_y, new_group_kwargs, start,
                        start)
                    X_key, _, _ = PhotonDataHelper.split_data(
                        initial_X_uncached, None, {}, start, start)
                    # we save the data in relation to the input path (X_key = hash(input X))
                    self.cache_man.update_single_subject_state_info(X_key)

                    start_time_saving = datetime.datetime.now()
                    self.cache_man.save_data_to_cache(
                        name, (X_batched, y_batched, kwargs_dict_batched))
                    saving_duration = (datetime.datetime.now() -
                                       start_time_saving).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, saving_duration, 1))

                # we need to collect the data only when we want to load them
                # we can skip that process if we only want them to get into the cache (case: parallelisation)
                if not self.skip_loading or needed_for_further_computation:
                    # stack results
                    processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

            # afterwards load everything that has been cached
            if len(list_of_idx_cached) > 0:
                if not self.skip_loading or needed_for_further_computation:
                    for cache_idx in list_of_idx_cached:
                        # we identify the data according to the input path (X before any transformation)
                        self.cache_man.update_single_subject_state_info(
                            [initial_X[cache_idx]])

                        # time the loading of the cached item
                        start_time_for_loading = datetime.datetime.now()
                        transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(
                            name)
                        loading_duration = (
                            datetime.datetime.now() -
                            start_time_for_loading).total_seconds()
                        self.time_monitor['transform_cached'].append(
                            (name, loading_duration,
                             PhotonDataHelper.find_n(X)))

                        processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                            processed_X, transformed_X, processed_y,
                            transformed_y, processed_kwargs,
                            transformed_kwargs)

            logger.debug(name + " loaded " + str(len(list_of_idx_cached)) +
                         " items from cache and computed " +
                         str(len(list_of_idx_non_cached)) + " items")
            if not self.skip_loading or needed_for_further_computation:
                # now sort the data in the correct order again
                processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                    processed_X, processed_y, processed_kwargs,
                    PhotonDataHelper.stack_data_vertically(
                        list_of_idx_cached, list_of_idx_non_cached))

            return processed_X, processed_y, processed_kwargs
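
Stripped of the subject-wise bookkeeping, the group-wise branch of load_or_save_cached_data is a classic load-or-compute cache: check for a hit, load on a hit, compute and save on a miss. The sketch below shows only that core, with a plain dict standing in for PHOTON's cache_man; every name here is hypothetical.

class DictCache:
    # Minimal stand-in for a cache manager backed by a plain dict.
    def __init__(self):
        self._store = {}

    def check_cache(self, key):
        return key in self._store

    def load_cached_data(self, key):
        return self._store[key]

    def save_data_to_cache(self, key, value):
        self._store[key] = value


def load_or_compute(cache, key, compute):
    # Hit: return the cached result. Miss: compute, save, then return.
    if cache.check_cache(key):
        return cache.load_cached_data(key)
    result = compute()
    cache.save_data_to_cache(key, result)
    return result


cache = DictCache()
load_or_compute(cache, "step1", lambda: sum(range(10)))  # computed and saved
load_or_compute(cache, "step1", lambda: sum(range(10)))  # served from cache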