Example #1
    def _fit_transform(self, X, y=None, **fit_params):
        """ fit and transform X by transforming it by every step in sequence """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        fit_params_steps = dict(
            (name, {}) for name, step in self.steps if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        for step_idx, (name, transformer) in enumerate(self.steps):
            if transformer is None:
                pass
            else:
                if hasattr(memory, 'cachedir') and memory.cachedir is None:
                    # we do not clone when caching is disabled to preserve
                    # backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, Xt, y, **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)

        return Xt
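This is scikit-learn's internal Pipeline caching pattern: each fitted transformer is memoized through joblib. A minimal sketch of how a caller enables it from the public API (the cache directory is illustrative, assuming a standard scikit-learn install):

from tempfile import mkdtemp
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Passing a directory (or a joblib.Memory) as `memory` makes the pipeline
# cache each fitted transformer, which is what _fit_transform exploits above.
pipe = Pipeline(
    steps=[('scale', StandardScaler()),
           ('pca', PCA(n_components=2)),
           ('clf', LogisticRegression())],
    memory=mkdtemp())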
Example #2
    def _get_trans_col(self, fm_keys):

        # Grab the right data files from the file mapping (casting to int!)
        data_files = [self.file_mapping[int(fm_key)] for fm_key in fm_keys]

        # Clone the base loader transformer
        cloned_transformer = clone(self.wrapper_transformer)

        # If a caching location is passed, create new load_and_trans_c func
        if self.cache_loc is not None:
            memory = check_memory(self.cache_loc)
            load_and_trans_c = memory.cache(load_and_trans)
        else:
            load_and_trans_c = load_and_trans

        if self.wrapper_n_jobs == 1:
            X_trans_cols = get_trans_chunk(cloned_transformer, data_files,
                                           load_and_trans_c)
        else:
            chunks = self.get_chunks(data_files)

            X_trans_chunks =\
                Parallel(n_jobs=self.wrapper_n_jobs)(
                    delayed(get_trans_chunk)(
                        transformer=cloned_transformer,
                        data_files=chunk,
                        func=load_and_trans_c)
                    for chunk in chunks)

            X_trans_cols = []
            for chunk in X_trans_chunks:
                X_trans_cols += chunk

        return X_trans_cols
Example #3
def test_check_memory():
    memory = check_memory("cache_directory")
    assert memory.cachedir == os.path.join('cache_directory', 'joblib')
    memory = check_memory(None)
    assert memory.cachedir is None
    dummy = DummyMemory()
    memory = check_memory(dummy)
    assert memory is dummy
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='1' instead.", check_memory, 1)
    dummy = WrongDummyMemory()
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy),
                        check_memory, dummy)
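The test pins down check_memory's contract: a string becomes a joblib.Memory rooted at that path, None becomes a Memory with caching disabled, and any Memory-like object passes through unchanged. A minimal sketch of using the returned object (assuming scikit-learn is installed; the path is illustrative):

from sklearn.utils.validation import check_memory

memory = check_memory('/tmp/my_cache')  # str -> joblib.Memory

def expensive(x):
    # stand-in for a costly computation
    return x ** 2

# The wrapped function is memoized on disk, keyed by its arguments:
# the first call computes, identical later calls replay the stored result.
cached_expensive = memory.cache(expensive)
print(cached_expensive(3))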
Example #4
    def _build_tree(self, X):
        memory = check_memory(self.memory)

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError(f"{self.affinity} was provided as affinity. "
                             f"Ward can only work with Euclidean distances.")
        if self.linkage not in _TREE_BUILDERS:
            raise ValueError(f"Unknown linkage type {self.linkage}. Valid "
                             f"options are {_TREE_BUILDERS.keys()}")
        tree_builder = _TREE_BUILDERS[self.linkage]

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity

        out = memory.cache(tree_builder)(X,
                                         n_clusters=None,
                                         return_distance=True,
                                         **kwargs)

        # Scikit-learn's tree_builder returns a tuple (children,
        # n_connected_components, n_leaves, parent, distances)
        self.children_, _, self.n_leaves_, _, self.distances_ = out
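Caching the tree builder pays off when the same X is clustered repeatedly, e.g. while sweeping cut heights, since the tree itself never changes. A standalone sketch of the same idea using scikit-learn's public ward_tree (the cache directory is illustrative):

import numpy as np
from joblib import Memory
from sklearn.cluster import ward_tree

memory = Memory('/tmp/tree_cache', verbose=0)
X = np.random.RandomState(0).rand(100, 5)

# The first call computes and stores the tree; identical calls replay it.
children, n_components, n_leaves, parents = memory.cache(ward_tree)(X)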
Example #5
    def _fit_local(self, X, y=None, **fit_params):
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        fit_params_steps = dict(
            (name, {}) for name, step in self.steps if step is not None)
        for pname, pval in fit_params.items():
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        transformers = self.get_procs(ProcKind.PROCESS, with_name=True)
        for step_idx, (name, transformer) in enumerate(transformers):
            if transformer is None:
                pass
            else:
                if hasattr(memory, "cachedir") and memory.cachedir is None:
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)

                Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, Xt, y, **fit_params_steps[name])
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, {}
        name_estimator = self.get_procs(ProcKind.MODEL, with_name=True)[0]
        return Xt, fit_params_steps[name_estimator[0]]
Example #6
def test_check_memory():
    memory = check_memory("cache_directory")
    assert_equal(memory.cachedir, os.path.join('cache_directory', 'joblib'))
    memory = check_memory(None)
    assert_equal(memory.cachedir, None)
    dummy = DummyMemory()
    memory = check_memory(dummy)
    assert memory is dummy
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='1' instead.", check_memory, 1)
    dummy = WrongDummyMemory()
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy),
                        check_memory, dummy)
Example #7
    def __init__(self, **kwargs):
        """
        Base estimator with the following allowed keyword args

            memory (bool/str/joblib.Memory): The path or Memory for caching the computational
                results; defaults to None, which means no caching.
            verbose (bool): Whether to show the progress of feature calculations.
            n_jobs (int): The number of parallel jobs. 0 means no parallel computations.
                If this value is negative or greater than the total number of
                CPUs, n_jobs is set to the CPU count of the system.

        Args:
            **kwargs: keyword args that contain possibly memory (str/joblib.Memory),
                verbose (bool), n_jobs (int)
        """
        allowed_kwargs = ['memory', 'verbose', 'n_jobs']
        for k, v in kwargs.items():
            if k not in allowed_kwargs:
                raise TypeError("%s not allowed as kwargs" % (str(k)))
        memory = kwargs.get("memory", None)
        if isinstance(memory, bool):
            memory = tempfile.mkdtemp()
            logger.info("Created temporary directory %s" % memory)
        verbose = kwargs.get("verbose", False)
        n_jobs = kwargs.get("n_jobs", 0)
        self.memory = check_memory(memory)
        self.verbose = verbose
        # find out the number of parallel jobs
        if (n_jobs < 0) or (n_jobs > cpu_count()):
            n_jobs = cpu_count()
            logger.info(f"Using {n_jobs} jobs for computation")
        self.n_jobs = n_jobs
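The n_jobs normalization at the end can be read in isolation; a minimal sketch of the same rule (normalize_n_jobs is an illustrative name, assuming joblib's cpu_count as the source of the helper):

from joblib import cpu_count

def normalize_n_jobs(n_jobs):
    # Negative or over-subscribed values fall back to the machine's CPU
    # count, matching the constructor above; 0 keeps computation serial.
    if n_jobs < 0 or n_jobs > cpu_count():
        return cpu_count()
    return n_jobs

assert normalize_n_jobs(-1) == cpu_count()
assert normalize_n_jobs(0) == 0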
Example #8
    def _fit(self, X, y=None, **fit_params):
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        fit_transform_resample_one_cached = memory.cache(
            _fit_transform_resample_one)

        fit_params_steps = {
            name: {}
            for name, step in self.steps if step is not None
        }
        for pname, pval in fit_params.items():
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        for step_idx, name, transformer in self._iter(with_final=False):
            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            if (hasattr(cloned_transformer, "resample")
                    or hasattr(cloned_transformer, "fit_transform_resample")):
                if y is None:
                    X, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, None, X, y,
                        **fit_params_steps[name])
                else:
                    X, y, fitted_transformer = \
                        fit_transform_resample_one_cached(
                            cloned_transformer, None, X, y,
                            **fit_params_steps[name])
            else:
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, X, y, **fit_params_steps[name])

            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == 'passthrough':
            return X, y, {}
        return X, y, fit_params_steps[self.steps[-1][0]]
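Because a resampler changes the number of rows, _fit must return y alongside X. A hedged usage sketch with imbalanced-learn's public Pipeline, which follows the same pattern (assuming imbalanced-learn is installed; the cache directory is illustrative):

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# The under-sampler rewrites both X and y, so the pipeline threads y
# through every step instead of treating it as fixed.
pipe = Pipeline(
    steps=[('scale', StandardScaler()),
           ('under', RandomUnderSampler(random_state=0)),
           ('clf', LogisticRegression())],
    memory='/tmp/imb_cache')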
Example #9
    def fit(self, X, y):
        memory = check_memory(self.memory)
        cached_fit = memory.cache(_iraps_core_fit)
        iraps_core = clone(self.iraps_core)
        # allow pre-fitted iraps_core here
        if not hasattr(iraps_core, 'pvalues_'):
            iraps_core = cached_fit(iraps_core, X, y)
        self.iraps_core_ = iraps_core

        pvalues = as_float_array(iraps_core.pvalues_, copy=True)
        # replace NaN p-values with the largest float so they never pass the threshold
        pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max

        fold_changes = as_float_array(iraps_core.fold_changes_, copy=True)
        fold_changes[np.isnan(fold_changes)] = 0.0

        base_values = as_float_array(iraps_core.base_values_, copy=True)

        p_thres = self.p_thres
        fc_thres = self.fc_thres
        occurrence = self.occurrence

        mask_0 = np.zeros(pvalues.shape, dtype=np.int32)
        # mark p-values at or below the threshold
        mask_0[pvalues <= p_thres] = 1
        # clear marks where the fold change magnitude is below the threshold
        mask_0[abs(fold_changes) < fc_thres] = 0

        # count the occurrence and mask greater than the threshold
        counts = mask_0.sum(axis=0)
        occurrence_thres = int(occurrence * iraps_core.n_iter)
        mask = np.zeros(counts.shape, dtype=bool)
        mask[counts >= occurrence_thres] = 1

        # generate signature
        fold_changes[mask_0 == 0] = 0.0
        signature = fold_changes[:, mask].sum(axis=0) / counts[mask]
        signature = np.vstack((signature, base_values[:, mask].mean(axis=0)))
        # It's not clear whether min_size could impact prediction
        # performance
        if signature is None\
                or signature.shape[1] < self.min_signature_features:
            raise ValueError("The classifier got None signature or the number "
                             "of sinature feature is less than minimum!")

        self.signature_ = np.asarray(signature)
        self.mask_ = mask
        # TODO: support other discretization methods: fixed value, upper
        # third quarter, etc.
        self.discretize_value = y.mean() + y.std() * self.discretize
        if iraps_core.negative_thres > iraps_core.positive_thres:
            self.less_is_positive = True
        else:
            self.less_is_positive = False

        return self
Example #10
    def _fit(self, X, y=None, **fit_params):
        """
        All of this stolen from scikit-learn except for
        "if name.startsiwth('pre_')..." at the bottom
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        fit_params_steps = dict((name, {}) for name, step in self.steps
                                if step is not None)
        for pname, pval in fit_params.items():
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                # For the HorizonTransformer right now.
                y_only = getattr(transformer, 'y_only', False)
                _Xt = y.copy() if y_only else Xt

                if hasattr(memory, 'cachedir') and memory.cachedir is None:
                    # we do not clone when caching is disabled to preserve
                    # backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                _Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, _Xt, y,
                    **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)

                if y_only:
                    y = _Xt
                else:
                    Xt = _Xt

                # This is so ugly :(
                if name.startswith('pre_') and not y_only:
                    y = transformer.transform(y[:, np.newaxis]).squeeze().copy()

        if self._final_estimator is None:
            return Xt, {}

        return Xt, fit_params_steps[self.steps[-1][0]], y
Example #11
    def _fit(self, X, y=None, **fit_params_steps):
        # shallow copy of steps - this should really be steps_
        if hasattr(self, 'raw_steps_') and self.raw_steps_ is not None:  # pylint: disable=E0203
            # Let's reuse the previous training.
            self.steps = list(self.raw_steps_)  # pylint: disable=E0203
            self.raw_steps_ = list(self.raw_steps_)
        else:
            self.steps = list(self.steps)
            self.raw_steps_ = list(self.steps)

        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        for (step_idx, name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if (transformer is None or transformer == 'passthrough'):
                with _print_elapsed_time('Pipeline',
                                         self._log_message(step_idx)):
                    continue

            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)

            # Fit or load from cache the current transformer
            x_train = X
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname='Pipeline',
                message=self._log_message(step_idx),
                **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.raw_steps_[step_idx] = (name, fitted_transformer)
            self.steps[step_idx] = (name,
                                    self._to_onnx(name, fitted_transformer,
                                                  x_train))
        return X
Example #12
    def _fit(self, X, y=None, **fit_params_steps):
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        for (step_idx, name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False,
                                        filter_resample=False):
            if transformer is None or transformer == "passthrough":
                with _print_elapsed_time("Pipeline",
                                         self._log_message(step_idx)):
                    continue

            try:
                # joblib >= 0.12
                mem = memory.location
            except AttributeError:
                mem = memory.cachedir
            finally:
                cloned_transformer = clone(transformer) if mem else transformer

            # Fit or load from cache the current transformer
            if hasattr(cloned_transformer, "transform") or hasattr(
                    cloned_transformer, "fit_transform"):
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    None,
                    message_clsname="Pipeline",
                    message=self._log_message(step_idx),
                    **fit_params_steps[name],
                )
            elif hasattr(cloned_transformer, "fit_resample"):
                X, y, fitted_transformer = fit_resample_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    message_clsname="Pipeline",
                    message=self._log_message(step_idx),
                    **fit_params_steps[name],
                )
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        return X, y
Example #13
    def _fit(self, X, y=None, **fit_params):
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        fit_params_steps = dict((name, {}) for name, step in self.steps
                                if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        yt = y
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if hasattr(memory, 'location'):
                    # joblib >= 0.12
                    if memory.location is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                    else:
                        cloned_transformer = clone(transformer)
                elif hasattr(memory, 'cachedir'):
                    # joblib < 0.11
                    if memory.cachedir is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                if (hasattr(cloned_transformer, "transform") or
                        hasattr(cloned_transformer, "fit_transform")):
                    Xt, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, None, Xt, yt,
                        **fit_params_steps[name])
                elif hasattr(cloned_transformer, "fit_resample"):
                    Xt, yt, fitted_transformer = fit_resample_one_cached(
                        cloned_transformer, Xt, yt, **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, yt, {}
        return Xt, yt, fit_params_steps[self.steps[-1][0]]
Example #14
    def _fit(self, X, y=None, **fit_params):
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        fit_params_steps = dict(
            (name, {}) for name, step in self.steps if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        yt = y
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if hasattr(memory, 'location'):
                    # joblib >= 0.12
                    if memory.location is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                    else:
                        cloned_transformer = clone(transformer)
                elif hasattr(memory, 'cachedir'):
                    # joblib < 0.11
                    if memory.cachedir is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                if (hasattr(cloned_transformer, "transform")
                        or hasattr(cloned_transformer, "fit_transform")):
                    Xt, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, None, Xt, yt,
                        **fit_params_steps[name])
                elif hasattr(cloned_transformer, "fit_resample"):
                    Xt, yt, fitted_transformer = fit_resample_one_cached(
                        cloned_transformer, Xt, yt, **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, yt, {}
        return Xt, yt, fit_params_steps[self.steps[-1][0]]
Example #15
    def _fit(self, X, y=None, **fit_params):

        # Get correct fit function as either with memory
        # caching, or just as is, if no cache loc passed.
        if self.cache_loc is not None:
            memory = check_memory(self.cache_loc)
            _fit_estimator_c = memory.cache(_fit_estimator)
        else:
            _fit_estimator_c = _fit_estimator

        # Fit the estimator
        self.estimator_ = _fit_estimator_c(estimator=self.estimator_,
                                           X=X[:, self.inds_],
                                           y=y,
                                           **fit_params)
Example #16
    def fit_transform(self, X, y=None, mapping=None, **fit_params):

        if mapping is None:
            mapping = {}

        self._proc_mapping(mapping)

        inds = self.wrapper_inds_
        self.rest_inds_ = [i for i in range(X.shape[1]) if i not in inds]

        # Before fit, need to handle annoying categorical encoders case
        # where there is no default setting to set to all cols
        # It shouldn't hurt to set these for other transformers (hopefully...)
        self.wrapper_transformer_ = clone(self.wrapper_transformer)
        self.wrapper_transformer_.cols = [i for i in range(len(inds))]
        self.wrapper_transformer_.return_df = False

        if self.cache_loc is not None:
            memory = check_memory(self.cache_loc)
            _fit_transform_single_transformer_c =\
                memory.cache(_fit_transform_single_transformer)
        else:
            _fit_transform_single_transformer_c =\
                _fit_transform_single_transformer

        self.wrapper_transformer_, X_trans =\
            _fit_transform_single_transformer_c(
                transformer=self.wrapper_transformer_,
                X=X[:, inds],
                y=y)

        self._X_trans_inds = [i for i in range(X_trans.shape[1])]

        new_mapping = {}

        # Many to Many case
        for i in inds:
            new_mapping[i] = self._X_trans_inds

        for cnt in range(len(self.rest_inds_)):
            new_mapping[self.rest_inds_[cnt]] = len(self._X_trans_inds) + cnt

        self._out_mapping = new_mapping.copy()

        # Update mapping
        update_mapping(mapping, new_mapping)
        return np.hstack([X_trans, X[:, self.rest_inds_]])
Example #17
    def _fit(self, X, y=None, **fit_params):
        '''Override this function from the ScopeObjs parent class almost
        exactly as is, but pass Xs instead of X.'''

        # Get correct fit function as either with memory
        # caching, or just as is, if no cache loc passed.
        if self.cache_loc is not None:
            memory = check_memory(self.cache_loc)
            _fit_estimator_c = memory.cache(_fit_estimator)
        else:
            _fit_estimator_c = _fit_estimator

        # Fit the estimator
        self.estimator_ =\
            _fit_estimator_c(estimator=self.estimator_,
                             Xs=[X[:, inds] for inds in self.view_inds_],
                             y=y, **fit_params)
Example #18
    def memory(self):
        # When no log callback function is given, change nothing;
        # if the cache method was previously overwritten, restore the original.
        if self._log_callback is None:
            if hasattr(self._memory, '_cache'):
                self._memory.cache = self._memory._cache
            return self._memory

        self._memory = check_memory(self._memory)

        # Overwrite cache function of memory such that it logs the
        # output when the function is called
        if not hasattr(self._memory, '_cache'):
            self._memory._cache = self._memory.cache
        self._memory.cache = _cache_with_function_log_statement(
            self._log_callback).__get__(self._memory, self._memory.__class__)
        return self._memory
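The __get__ call at the end binds a plain function to the existing Memory instance, so the patched cache behaves like a regular method. A self-contained demonstration of that descriptor trick, independent of joblib (the class and function names are illustrative):

class Greeter:
    def hello(self):
        return 'hello'

def shout(self):
    # An unbound function becomes a bound method via the descriptor
    # protocol, mirroring how the logging cache is attached above.
    return self.hello().upper()

g = Greeter()
g.hello = shout.__get__(g, Greeter)
print(g.hello())  # HELLO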
Example #19
def test_check_memory():
    memory = check_memory("cache_directory")
    assert memory.cachedir == os.path.join('cache_directory', 'joblib')
    memory = check_memory(None)
    assert memory.cachedir is None
    dummy = DummyMemory()
    memory = check_memory(dummy)
    assert memory is dummy

    msg = "'memory' should be None, a string or have the same interface as" \
          " joblib.Memory. Got memory='1' instead."
    with pytest.raises(ValueError, match=msg):
        check_memory(1)
    dummy = WrongDummyMemory()
    msg = "'memory' should be None, a string or have the same interface as" \
          " joblib.Memory. Got memory='{}' instead.".format(dummy)
    with pytest.raises(ValueError, match=msg):
        check_memory(dummy)
Example #20
    def _get_trans_col(self, fm_keys):

        # Grab the right data files from the file mapping (casting to int!)
        try:
            data_files = [self.file_mapping[int(fm_key)] for fm_key in fm_keys]

        # Raise a clearer error if a NaN key is found
        except ValueError:
            raise ValueError(
                'NaN found while trying to load a DataFile; '
                'make sure there are no missing DataFiles!')

        # Clone the base loader
        cloned_estimator = clone(self.estimator)

        # If a caching location is passed, create new load_and_trans_c func
        if self.cache_loc is not None:
            memory = check_memory(self.cache_loc)
            load_and_trans_c = memory.cache(load_and_trans)
        else:
            load_and_trans_c = load_and_trans

        if self._n_jobs == 1:
            X_trans_cols = get_trans_chunk(cloned_estimator, data_files,
                                           load_and_trans_c)
        else:
            chunks = self.get_chunks(data_files)

            X_trans_chunks =\
                Parallel(n_jobs=self._n_jobs)(
                    delayed(get_trans_chunk)(
                        transformer=cloned_estimator,
                        data_files=chunk,
                        func=load_and_trans_c)
                    for chunk in chunks)

            X_trans_cols = []
            for chunk in X_trans_chunks:
                X_trans_cols += chunk

        return X_trans_cols
Example #21
    def _fit(self, X, y=None, **fit_params):
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_blend_one_cached = memory.cache(self._fit_blend_one)

        fit_params_steps = dict(
            (name, {}) for name, step in self.steps if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        indexes = np.arange(X.shape[0])
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if hasattr(memory, 'cachedir') and memory.cachedir is None:
                    # we do not clone when caching is disabled to preserve
                    # backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                Xt, fitted_transformer, indexes = fit_blend_one_cached(
                    cloned_transformer, Xt, y[indexes], None,
                    **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)

        if self._final_estimator is None:
            return Xt, {}, indexes

        return Xt, fit_params_steps[self.steps[-1][0]], indexes
Example #22
    def transform(self, X, y=None):
        memory = check_memory(self.memory)

        _transform_cached = memory.cache(self._transform)

        return _transform_cached(X, y)
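Here the bound method itself is cached, so joblib memoizes _transform keyed on its call arguments. Note that joblib's documentation cautions against caching methods directly (a module-level function is the recommended pattern). A standalone sketch of the same shape (the class name and cache directory are illustrative):

import numpy as np
from sklearn.utils.validation import check_memory

class CachedTransform:
    def __init__(self, memory=None):
        self.memory = memory

    def _transform(self, X, y=None):
        return X * 2  # stand-in for an expensive transform

    def transform(self, X, y=None):
        memory = check_memory(self.memory)
        # check_memory turns a path into a joblib.Memory; cache() memoizes.
        return memory.cache(self._transform)(X, y)

t = CachedTransform(memory='/tmp/transform_cache')
print(t.transform(np.arange(3)))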
Example #23
    def _fit(self, X, y=None, **fit_params_steps):
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        for (step_idx,
             name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if (transformer is None or transformer == 'passthrough'):
                with _print_elapsed_time('Pipeline',
                                         self._log_message(step_idx)):
                    continue

            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)

            if not self._skip_transform(cloned_transformer):
                if isinstance(cloned_transformer, YTransformer):
                    y, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, y, X, None,
                        message_clsname='Pipeline',
                        message=self._log_message(step_idx),
                        **fit_params_steps[name])
                elif isinstance(cloned_transformer, XAndYTransformer) \
                        or isinstance(cloned_transformer, XOrYTransformer):
                    X, y, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, X, y, None,
                        message_clsname='Pipeline',
                        message=self._log_message(step_idx),
                        **fit_params_steps[name])
                else:
                    # Fit or load from cache the current transformer
                    X, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, X, y, None,
                        message_clsname='Pipeline',
                        message=self._log_message(step_idx),
                        **fit_params_steps[name])
            else:
                # do nothing when not in train mode and the train-only wrapper is set
                fitted_transformer = cloned_transformer
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        return X, y
Example #24
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data. Shape [n_samples, n_features], or [n_samples,
            n_samples] if affinity=='precomputed'.

        y : Ignored

        Returns
        -------
        self
        """
        if (self.pooling_func != 'deprecated'
                and not isinstance(self, AgglomerationTransform)):
            warnings.warn(
                'Agglomerative "pooling_func" parameter is not used.'
                ' It has been deprecated in version 0.20 and will be'
                ' removed in 0.22', DeprecationWarning)
        X = check_array(X, ensure_min_samples=2, estimator=self)
        memory = check_memory(self.memory)

        if self.n_clusters is not None and self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
            raise ValueError("Exactly one of n_clusters and "
                             "distance_threshold has to be set, and the other "
                             "needs to be None.")

        if (self.distance_threshold is not None
                and not self.compute_full_tree):
            raise ValueError("compute_full_tree must be True if "
                             "distance_threshold is set.")

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError("%s was provided as affinity. Ward can only "
                             "work with euclidean distances." %
                             (self.affinity, ))

        if self.linkage not in _TREE_BUILDERS:
            raise ValueError("Unknown linkage type %s. "
                             "Valid options are %s" %
                             (self.linkage, _TREE_BUILDERS.keys()))
        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(connectivity,
                                       accept_sparse=['csr', 'coo', 'lil'])

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            if self.distance_threshold is not None:
                compute_full_tree = True
            else:
                # Early stopping is likely to give a speed up only for
                # a large number of clusters. The actual threshold
                # implemented here is heuristic
                compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity

        distance_threshold = self.distance_threshold

        return_distance = distance_threshold is not None
        out = memory.cache(tree_builder)(X,
                                         connectivity,
                                         n_clusters=n_clusters,
                                         return_distance=return_distance,
                                         **kwargs)
        (self.children_, self.n_connected_components_, self.n_leaves_,
         parents) = out[:4]

        if distance_threshold is not None:
            distances = out[-1]
            self.distances_ = distances
            self.n_clusters_ = np.count_nonzero(
                distances >= distance_threshold) + 1
        else:
            self.n_clusters_ = self.n_clusters

        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self
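Callers opt into this caching through the estimator's memory parameter; a minimal usage sketch (assuming a scikit-learn version with distance_threshold support; the cache directory is illustrative):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.random.RandomState(0).rand(50, 3)

# The tree construction is cached on disk, so re-fitting with a different
# n_clusters on the same data reuses the expensive tree-building step.
model = AgglomerativeClustering(n_clusters=3, memory='/tmp/agg_cache')
labels = model.fit(X).labels_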
Example #25
    def _fit(self, X, y=None, **fit_params_steps):
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(skpipeline._fit_transform_one)

        conf_score = None
        for (step_idx,
             name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if transformer is None or transformer == 'passthrough':
                with _print_elapsed_time('Pipeline',
                                         self._log_message(step_idx)):
                    continue

            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)

            # Fit or load from cache the current transformer
            if hasattr(cloned_transformer, "transform") or hasattr(
                    cloned_transformer, "fit_transform"
            ):
                res, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, X, y, None,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name]
                )
                # This ugly if/else can be removed if Transformers return
                # additional values (i.e. `conf_score`) in dict. Can be
                # appended to `fit_params_steps` dict.
                if isinstance(res, tuple):
                    if len(res) == 3:
                        X, y, conf_score = res
                    elif len(res) == 2:
                        X, y = res
                else:
                    X = res

            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)

        return X, y, conf_score
Example #26
    def _fit(self, X, y=None, **fit_params):
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        fit_params_steps = {
            name: {}
            for name, step in self.steps if step is not None
        }
        for pname, pval in fit_params.items():
            if '__' not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname))
            step, param = pname.split("__", 1)
            fit_params_steps[step][param] = pval
        for (step_idx, name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False,
                                        filter_resample=False):
            if (transformer is None or transformer == 'passthrough'):
                with _print_elapsed_time('Pipeline',
                                         self._log_message(step_idx)):
                    continue
            if hasattr(memory, "location"):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, "cachedir"):
                # joblib <= 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            if hasattr(cloned_transformer, "transform") or hasattr(
                    cloned_transformer, "fit_transform"):
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    None,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name])
            elif hasattr(cloned_transformer, "fit_resample"):
                X, y, fitted_transformer = fit_resample_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == "passthrough":
            return X, y, {}
        return X, y, fit_params_steps[self.steps[-1][0]]
Example #27
    def transform(self, X, y=None):
        memory = check_memory(self.memory)

        _transform_cached = memory.cache(self._transform)

        return _transform_cached(X, y)
Example #28
    def _fit(self,
             X_train,
             y_train,
             X_valid=None,
             y_valid=None,
             X_test=None,
             y_test=None):
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        for (step_idx, name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if (transformer is None or transformer == 'passthrough'):
                continue

            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer

            result, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X_train,
                y_train,
                X_valid,
                y_valid,
                X_test,
                y_test,
                self.resource_manager,
                message_clsname='Pipeline',
                message=self._log_message(step_idx))
            X_train = result["X_train"]
            X_valid = result.get("X_valid")
            X_test = result.get("X_test")
            y_train = result.get("y_train")
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)

        return {
            "X_train": X_train,
            "X_valid": X_valid,
            "X_test": X_test,
            "y_train": y_train
        }
Example #29
    def _fit(self, X, y=None, **fit_params):
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        fit_params_steps = {name: {} for name, step in self.steps if step is not None}
        for pname, pval in fit_params.items():
            if "__" not in pname:
                raise ValueError(
                    f"Pipeline.fit does not accept the {pname} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`."
                )
            step, param = pname.split("__", 1)
            fit_params_steps[step][param] = pval
        for (step_idx, name, transformer) in self._iter(
            with_final=False, filter_passthrough=False, filter_resample=False
        ):
            if transformer is None or transformer == "passthrough":
                with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
                    continue

            try:
                # joblib >= 0.12
                mem = memory.location
            except AttributeError:
                mem = memory.cachedir
            finally:
                cloned_transformer = clone(transformer) if mem else transformer

            # Fit or load from cache the current transformer
            if hasattr(cloned_transformer, "transform") or hasattr(
                cloned_transformer, "fit_transform"
            ):
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    None,
                    message_clsname="Pipeline",
                    message=self._log_message(step_idx),
                    **fit_params_steps[name],
                )
            elif hasattr(cloned_transformer, "fit_resample"):
                X, y, fitted_transformer = fit_resample_one_cached(
                    cloned_transformer,
                    X,
                    y,
                    message_clsname="Pipeline",
                    message=self._log_message(step_idx),
                    **fit_params_steps[name],
                )
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == "passthrough":
            return X, y, {}
        return X, y, fit_params_steps[self.steps[-1][0]]