def _fit_transform(self, X, y=None, **fit_params): """ fit and transform X by transforming it by every step in sequence """ # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() # Setup the memory memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) fit_params_steps = dict( (name, {}) for name, step in self.steps if step is not None) for pname, pval in six.iteritems(fit_params): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X for step_idx, (name, transformer) in enumerate(self.steps): if transformer is None: pass else: if hasattr(memory, 'cachedir') and memory.cachedir is None: # we do not clone when caching is disabled to preserve # backward compatibility cloned_transformer = transformer else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer Xt, fitted_transformer = fit_transform_one_cached( cloned_transformer, None, Xt, y, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) return Xt
def _get_trans_col(self, fm_keys):
    """Load and transform the data files referenced by ``fm_keys``.

    Loading is optionally memory-cached (``cache_loc``) and optionally
    parallelized over chunks (``wrapper_n_jobs``).  Returns the list of
    transformed columns.
    """
    # File-mapping keys are cast to int before lookup.
    files = [self.file_mapping[int(key)] for key in fm_keys]

    # Always work on a clone of the base loader transformer.
    base = clone(self.wrapper_transformer)

    # Wrap the loading function in a joblib cache when a location is set.
    if self.cache_loc is None:
        loader = load_and_trans
    else:
        loader = check_memory(self.cache_loc).cache(load_and_trans)

    # Serial path: one chunk, no Parallel overhead.
    if self.wrapper_n_jobs == 1:
        return get_trans_chunk(base, files, loader)

    # Parallel path: split into chunks, transform each, then flatten.
    per_chunk = Parallel(n_jobs=self.wrapper_n_jobs)(
        delayed(get_trans_chunk)(
            transformer=base, data_files=chunk, func=loader)
        for chunk in self.get_chunks(files))

    return [col for chunk in per_chunk for col in chunk]
def test_check_memory():
    """check_memory: str -> joblib.Memory rooted at path, None -> disabled
    cache, Memory-like objects pass through, anything else raises."""
    memory = check_memory("cache_directory")
    assert memory.cachedir == os.path.join('cache_directory', 'joblib')
    memory = check_memory(None)
    assert memory.cachedir is None
    dummy = DummyMemory()
    memory = check_memory(dummy)
    # Duck-typed Memory objects are returned unchanged.
    assert memory is dummy
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='1' instead.", check_memory, 1)
    dummy = WrongDummyMemory()
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy), check_memory,
                        dummy)
def _build_tree(self, X):
    """Build the hierarchical clustering tree on X.

    The tree-builder call is cached via ``self.memory``.  Sets
    ``children_``, ``n_leaves_`` and ``distances_``.

    Raises ValueError for ward linkage with a non-Euclidean affinity,
    or for an unknown linkage type.
    """
    memory = check_memory(self.memory)

    # Ward linkage is only defined for Euclidean distances.
    if self.linkage == "ward" and self.affinity != "euclidean":
        raise ValueError(f"{self.affinity} was provided as affinity. "
                         f"Ward can only work with Euclidean distances.")

    if self.linkage not in _TREE_BUILDERS:
        raise ValueError(f"Unknown linkage type {self.linkage}. Valid "
                         f"options are {_TREE_BUILDERS.keys()}")
    tree_builder = _TREE_BUILDERS[self.linkage]

    # Construct the tree; ward's builder takes no linkage/affinity kwargs.
    kwargs = {}
    if self.linkage != 'ward':
        kwargs['linkage'] = self.linkage
        kwargs['affinity'] = self.affinity

    out = memory.cache(tree_builder)(X, n_clusters=None,
                                     return_distance=True, **kwargs)

    # Scikit-learn's tree_builder returns a tuple (children,
    # n_connected_components, n_leaves, parent, distances)
    self.children_, _, self.n_leaves_, _, self.distances_ = out
def _fit_local(self, X, y=None, **fit_params):
    """Fit/transform the PROCESS steps in sequence, caching via memory.

    Returns ``(Xt, fit_params_for_model)``: the transformed data and the
    fit params routed to the first MODEL step, or ``{}`` when there is
    no final estimator.
    """
    # shallow copy of steps - this should really be steps_
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)

    fit_params_steps = dict(
        (name, {}) for name, step in self.steps if step is not None)
    # Route 'step__param' entries to their step's params dict.
    for pname, pval in fit_params.items():
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    Xt = X
    transformers = self.get_procs(ProcKind.PROCESS, with_name=True)
    for step_idx, (name, transformer) in enumerate(transformers):
        if transformer is None:
            pass
        else:
            if hasattr(memory, "cachedir") and memory.cachedir is None:
                # we do not clone when caching is disabled to preserve
                # backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
            Xt, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, None, Xt, y,
                **fit_params_steps[name])
            # Keep the fitted transformer (needed when loaded from cache).
            self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator is None:
        return Xt, {}
    name_estimator = self.get_procs(ProcKind.MODEL, with_name=True)[0]
    return Xt, fit_params_steps[name_estimator[0]]
def test_check_memory():
    """check_memory: str -> joblib.Memory rooted at path, None -> disabled
    cache, Memory-like objects pass through, anything else raises.

    Uses plain ``assert`` instead of the deprecated ``assert_equal``
    helper, matching the other test variants in this file.
    """
    memory = check_memory("cache_directory")
    assert memory.cachedir == os.path.join('cache_directory', 'joblib')
    memory = check_memory(None)
    assert memory.cachedir is None
    dummy = DummyMemory()
    memory = check_memory(dummy)
    # Duck-typed Memory objects are returned unchanged.
    assert memory is dummy
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='1' instead.", check_memory, 1)
    dummy = WrongDummyMemory()
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy), check_memory,
                        dummy)
def __init__(self, **kwargs):
    """
    Base estimator with the following allowed keyword args

        memory (bool/str/joblib.Memory): The path or Memory for caching the
            computational results, default None means no cache. True means
            cache in a fresh temporary directory; False means no cache.
        verbose (bool): Whether to show the progress of feature calculations.
        n_jobs (int): The number of parallel jobs. 0 means no parallel
            computations. If this value is set to negative or greater than
            the total cpu then n_jobs is set to the number of cpu on system.

    Args:
        **kwargs: keyword args that contain possibly
            memory (bool/str/joblib.Memory), verbose (bool), n_jobs (int)
    """
    allowed_kwargs = ['memory', 'verbose', 'n_jobs']
    # Values are not inspected here, only the key names.
    for k in kwargs:
        if k not in allowed_kwargs:
            raise TypeError("%s not allowed as kwargs" % (str(k)))
    memory = kwargs.get("memory", None)
    if isinstance(memory, bool):
        # Bug fix: memory=False previously created a temp cache dir just
        # like memory=True; now False disables caching (same as None).
        if memory:
            memory = tempfile.mkdtemp()
            logger.info("Created temporary directory %s" % memory)
        else:
            memory = None
    verbose = kwargs.get("verbose", False)
    n_jobs = kwargs.get("n_jobs", 0)
    self.memory = check_memory(memory)
    self.verbose = verbose
    # find out the number of parallel jobs; clamp invalid values to the
    # machine's CPU count.
    if (n_jobs < 0) or (n_jobs > cpu_count()):
        n_jobs = cpu_count()
    logger.info(f"Using {n_jobs} jobs for computation")
    self.n_jobs = n_jobs
def _fit(self, X, y=None, **fit_params):
    """Fit every non-final step, supporting transform and resample steps.

    Steps with ``resample``/``fit_transform_resample`` may replace ``y``
    as well as ``X``.  Returns ``(X, y, final_step_fit_params)``.
    """
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)
    fit_transform_resample_one_cached = memory.cache(
        _fit_transform_resample_one)

    fit_params_steps = {
        name: {} for name, step in self.steps if step is not None
    }
    for pname, pval in fit_params.items():
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    for step_idx, name, transformer in self._iter(with_final=False):
        if hasattr(memory, 'location'):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        elif hasattr(memory, 'cachedir'):
            # joblib < 0.11
            if memory.cachedir is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        # NOTE(review): a Memory-like object with neither 'location' nor
        # 'cachedir' would leave cloned_transformer unbound here — confirm
        # whether such objects can reach this code.
        # Fit or load from cache the current transformer
        if (hasattr(cloned_transformer, "resample") or
                hasattr(cloned_transformer, "fit_transform_resample")):
            if y is None:
                # Without y there is nothing to resample.
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, X, y,
                    **fit_params_steps[name])
            else:
                X, y, fitted_transformer = \
                    fit_transform_resample_one_cached(
                        cloned_transformer, None, X, y,
                        **fit_params_steps[name])
        else:
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, None, X, y,
                **fit_params_steps[name])
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator == 'passthrough':
        return X, y, {}
    return X, y, fit_params_steps[self.steps[-1][0]]
def fit(self, X, y):
    """Fit the IRAPS-based classifier on (X, y).

    Fits (or reuses a pre-fitted) ``iraps_core``, derives a feature mask
    from p-value / fold-change thresholds and an occurrence count, then
    builds the signature used for prediction.  Returns ``self``.
    """
    memory = check_memory(self.memory)
    cached_fit = memory.cache(_iraps_core_fit)
    iraps_core = clone(self.iraps_core)

    # allow pre-fitted iraps_core here
    if not hasattr(iraps_core, 'pvalues_'):
        iraps_core = cached_fit(iraps_core, X, y)
    self.iraps_core_ = iraps_core

    pvalues = as_float_array(iraps_core.pvalues_, copy=True)
    # NaN p-values are replaced by the largest float so they never pass
    # the p_thres test below.
    pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max

    fold_changes = as_float_array(iraps_core.fold_changes_, copy=True)
    fold_changes[np.isnan(fold_changes)] = 0.0

    base_values = as_float_array(iraps_core.base_values_, copy=True)

    p_thres = self.p_thres
    fc_thres = self.fc_thres
    occurrence = self.occurrence

    mask_0 = np.zeros(pvalues.shape, dtype=np.int32)
    # mark p_values less than the threshold
    mask_0[pvalues <= p_thres] = 1
    # keep marks only where |fold_change| reaches the threshold
    mask_0[abs(fold_changes) < fc_thres] = 0

    # count the occurrence and mask greater than the threshold
    counts = mask_0.sum(axis=0)
    occurrence_thres = int(occurrence * iraps_core.n_iter)
    mask = np.zeros(counts.shape, dtype=bool)
    mask[counts >= occurrence_thres] = 1

    # generate signature: mean passing fold change per kept feature,
    # stacked with the mean base value.
    fold_changes[mask_0 == 0] = 0.0
    signature = fold_changes[:, mask].sum(axis=0) / counts[mask]
    signature = np.vstack((signature, base_values[:, mask].mean(axis=0)))
    # NOTE(review): signature cannot be None after np.vstack — only the
    # min_signature_features check below is effective.
    if signature is None\
            or signature.shape[1] < self.min_signature_features:
        raise ValueError("The classifier got None signature or the number "
                         "of sinature feature is less than minimum!")

    self.signature_ = np.asarray(signature)
    self.mask_ = mask
    # TODO: support other discretize method: fixed value, upper
    # third quarter, etc.
    self.discretize_value = y.mean() + y.std() * self.discretize
    if iraps_core.negative_thres > iraps_core.positive_thres:
        self.less_is_positive = True
    else:
        self.less_is_positive = False

    return self
def _fit(self, X, y=None, **fit_params): """ All of this stolen from scikit-learn except for "if name.startsiwth('pre_')..." at the bottom """ # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() # Setup the memory memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) fit_params_steps = dict((name, {}) for name, step in self.steps if step is not None) for pname, pval in fit_params.items(): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X for step_idx, (name, transformer) in enumerate(self.steps[:-1]): if transformer is None: pass else: # For the HorizonTransformer right now. y_only = getattr(transformer, 'y_only', False) _Xt = y.copy() if y_only else Xt if hasattr(memory, 'cachedir') and memory.cachedir is None: # we do not clone when caching is disabled to preserve # backward compatibility cloned_transformer = transformer else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer _Xt, fitted_transformer = fit_transform_one_cached( cloned_transformer, None, _Xt, y, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) if y_only: y = _Xt else: Xt = _Xt # This is so ugly :( if name.startswith('pre_') and not y_only: y = transformer.transform(y[:, np.newaxis]).squeeze().copy() if self._final_estimator is None: return Xt, {} return Xt, fit_params_steps[self.steps[-1][0]], y
def _fit(self, X, y=None, **fit_params_steps):
    """Fit every non-final step and mirror each fitted step as ONNX.

    ``raw_steps_`` keeps the fitted (non-ONNX) transformers so a previous
    training can be reused; ``steps`` holds the ONNX conversions.
    Returns the transformed ``X``.
    """
    # shallow copy of steps - this should really be steps_
    if hasattr(self, 'raw_steps_') and self.raw_steps_ is not None:  # pylint: disable=E0203
        # Let's reuse the previous training.
        self.steps = list(self.raw_steps_)  # pylint: disable=E0203
        self.raw_steps_ = list(self.raw_steps_)
    else:
        self.steps = list(self.steps)
        self.raw_steps_ = list(self.steps)

    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)

    for (step_idx, name, transformer) in self._iter(with_final=False,
                                                    filter_passthrough=False):
        if (transformer is None or transformer == 'passthrough'):
            with _print_elapsed_time('Pipeline',
                                     self._log_message(step_idx)):
                continue

        if hasattr(memory, 'location'):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        else:
            cloned_transformer = clone(transformer)

        # Fit or load from cache the current transformer; keep the
        # pre-transform input for the ONNX conversion below.
        x_train = X
        X, fitted_transformer = fit_transform_one_cached(
            cloned_transformer, X, y, None,
            message_clsname='Pipeline',
            message=self._log_message(step_idx),
            **fit_params_steps[name])
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.raw_steps_[step_idx] = (name, fitted_transformer)
        self.steps[step_idx] = (name, self._to_onnx(name,
                                                    fitted_transformer,
                                                    x_train))
    return X
def _fit(self, X, y=None, **fit_params_steps):
    """Fit every non-final step; resampler steps may replace ``y`` too.

    Returns the transformed ``(X, y)``.
    """
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
    fit_resample_one_cached = memory.cache(_fit_resample_one)

    for (step_idx, name, transformer) in self._iter(
            with_final=False, filter_passthrough=False,
            filter_resample=False):
        if transformer is None or transformer == "passthrough":
            with _print_elapsed_time("Pipeline",
                                     self._log_message(step_idx)):
                continue

        try:
            # joblib >= 0.12
            mem = memory.location
        except AttributeError:
            # joblib < 0.12 exposes 'cachedir' instead
            mem = memory.cachedir
        finally:
            # we do not clone when caching is disabled to preserve
            # backward compatibility
            cloned_transformer = clone(transformer) if mem else transformer

        # Fit or load from cache the current transformer
        if hasattr(cloned_transformer, "transform") or hasattr(
                cloned_transformer, "fit_transform"):
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
        elif hasattr(cloned_transformer, "fit_resample"):
            X, y, fitted_transformer = fit_resample_one_cached(
                cloned_transformer,
                X,
                y,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    return X, y
def _fit(self, X, y=None, **fit_params):
    """Fit every step but the last, supporting transform and resample steps.

    Returns ``(Xt, yt, final_step_fit_params)``; the final params dict is
    ``{}`` when there is no final estimator.

    Fix: a Memory-like object exposing neither ``location`` (joblib >=
    0.12) nor ``cachedir`` (joblib < 0.11) previously left
    ``cloned_transformer`` unbound and raised UnboundLocalError; it now
    falls back to cloning.
    """
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)
    fit_resample_one_cached = memory.cache(_fit_resample_one)

    # Route 'step__param' fit params to their step.
    fit_params_steps = dict((name, {}) for name, step in self.steps
                            if step is not None)
    for pname, pval in six.iteritems(fit_params):
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    Xt = X
    yt = y
    for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
        if transformer is None:
            pass
        else:
            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                # Custom Memory-like object: default to cloning.
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            if (hasattr(cloned_transformer, "transform") or
                    hasattr(cloned_transformer, "fit_transform")):
                Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, Xt, yt,
                    **fit_params_steps[name])
            elif hasattr(cloned_transformer, "fit_resample"):
                Xt, yt, fitted_transformer = fit_resample_one_cached(
                    cloned_transformer, Xt, yt,
                    **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator is None:
        return Xt, yt, {}
    return Xt, yt, fit_params_steps[self.steps[-1][0]]
def _fit(self, X, y=None, **fit_params):
    """Fit every step but the last, supporting transform and resample steps.

    Returns ``(Xt, yt, final_step_fit_params)``; the final params dict is
    ``{}`` when there is no final estimator.

    Fix: a Memory-like object exposing neither ``location`` (joblib >=
    0.12) nor ``cachedir`` (joblib < 0.11) previously left
    ``cloned_transformer`` unbound and raised UnboundLocalError; it now
    falls back to cloning.
    """
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)
    fit_resample_one_cached = memory.cache(_fit_resample_one)

    fit_params_steps = dict(
        (name, {}) for name, step in self.steps if step is not None)
    for pname, pval in six.iteritems(fit_params):
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    Xt = X
    yt = y
    for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
        if transformer is None:
            pass
        else:
            if hasattr(memory, 'location'):
                # joblib >= 0.12
                if memory.location is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            elif hasattr(memory, 'cachedir'):
                # joblib < 0.11
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to
                    # preserve backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
            else:
                # Custom Memory-like object: default to cloning.
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            if (hasattr(cloned_transformer, "transform") or
                    hasattr(cloned_transformer, "fit_transform")):
                Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, None, Xt, yt,
                    **fit_params_steps[name])
            elif hasattr(cloned_transformer, "fit_resample"):
                Xt, yt, fitted_transformer = fit_resample_one_cached(
                    cloned_transformer, Xt, yt,
                    **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator is None:
        return Xt, yt, {}
    return Xt, yt, fit_params_steps[self.steps[-1][0]]
def _fit(self, X, y=None, **fit_params):
    """Fit ``self.estimator_`` on the columns selected by ``self.inds_``.

    The fitting call is memoized through joblib when ``cache_loc`` is set.
    """
    # Pick the cached or the plain fitting function.
    if self.cache_loc is None:
        fit_func = _fit_estimator
    else:
        fit_func = check_memory(self.cache_loc).cache(_fit_estimator)

    # Fit only on the selected column subset.
    self.estimator_ = fit_func(estimator=self.estimator_,
                               X=X[:, self.inds_],
                               y=y, **fit_params)
def fit_transform(self, X, y=None, mapping=None, **fit_params):
    """Fit the wrapped transformer on the ``wrapper_inds_`` columns and
    return those transformed columns stacked with the untouched rest.

    ``mapping`` (old column index -> new index/indices) is updated in
    place via ``update_mapping`` to reflect the new column layout.
    """
    if mapping is None:
        mapping = {}

    self._proc_mapping(mapping)

    inds = self.wrapper_inds_
    self.rest_inds_ = [i for i in range(X.shape[1]) if i not in inds]

    # Before fit, need to handle annoying categorical encoders case
    # where there is no default setting to set to all cols
    # It shouldn't hurt to set these for other transformers (hopefully...)
    self.wrapper_transformer_ = clone(self.wrapper_transformer)
    self.wrapper_transformer_.cols = [i for i in range(len(inds))]
    self.wrapper_transformer_.return_df = False

    # Memoize the fit/transform helper when a cache location is set.
    if self.cache_loc is not None:
        memory = check_memory(self.cache_loc)
        _fit_transform_single_transformer_c =\
            memory.cache(_fit_transform_single_transformer)
    else:
        _fit_transform_single_transformer_c =\
            _fit_transform_single_transformer

    self.wrapper_transformer_, X_trans =\
        _fit_transform_single_transformer_c(
            transformer=self.wrapper_transformer_,
            X=X[:, inds], y=y)

    self._X_trans_inds = [i for i in range(X_trans.shape[1])]

    new_mapping = {}

    # Many to Many case: every input column maps to all output columns.
    for i in inds:
        new_mapping[i] = self._X_trans_inds

    # Untouched columns shift to after the transformed block.
    for cnt in range(len(self.rest_inds_)):
        new_mapping[self.rest_inds_[cnt]] = len(self._X_trans_inds) + cnt

    self._out_mapping = new_mapping.copy()

    # Update mapping
    update_mapping(mapping, new_mapping)

    return np.hstack([X_trans, X[:, self.rest_inds_]])
def _fit(self, X, y=None, **fit_params):
    '''Override of the ScopeObjs parent ``_fit``: identical logic, but the
    estimator receives a list ``Xs`` of column views instead of one X.'''
    # Pick the cached or the plain fitting function.
    if self.cache_loc is None:
        fit_func = _fit_estimator
    else:
        fit_func = check_memory(self.cache_loc).cache(_fit_estimator)

    # One view of X per entry in view_inds_.
    views = [X[:, inds] for inds in self.view_inds_]
    self.estimator_ = fit_func(estimator=self.estimator_,
                               Xs=views, y=y, **fit_params)
def memory(self):
    """Property getter: return ``self._memory`` with its ``cache`` method
    wrapped so cached calls log through ``self._log_callback``.

    When no callback is set, any earlier wrapping is undone before
    returning.  The original ``cache`` is stashed on ``_memory._cache``.
    """
    # When no log callback function is given, change nothing.
    # Or, if the memory cache was changed, set it back to its original.
    if self._log_callback is None:
        if hasattr(self._memory, '_cache'):
            self._memory.cache = self._memory._cache
        return self._memory

    self._memory = check_memory(self._memory)

    # Overwrite cache function of memory such that it logs the
    # output when the function is called
    if not hasattr(self._memory, '_cache'):
        # Stash the original so it can be restored later.
        self._memory._cache = self._memory.cache
    # Bind the logging wrapper as a method on the Memory instance.
    self._memory.cache = _cache_with_function_log_statement(
        self._log_callback).__get__(self._memory, self._memory.__class__)
    return self._memory
def test_check_memory():
    """check_memory: str -> joblib.Memory rooted at path, None -> disabled
    cache, Memory-like objects pass through, anything else raises."""
    memory = check_memory("cache_directory")
    assert memory.cachedir == os.path.join('cache_directory', 'joblib')
    memory = check_memory(None)
    assert memory.cachedir is None
    dummy = DummyMemory()
    memory = check_memory(dummy)
    # Duck-typed Memory objects are returned unchanged.
    assert memory is dummy

    msg = "'memory' should be None, a string or have the same interface as" \
          " joblib.Memory. Got memory='1' instead."
    with pytest.raises(ValueError, match=msg):
        check_memory(1)
    dummy = WrongDummyMemory()
    msg = "'memory' should be None, a string or have the same interface as" \
          " joblib.Memory. Got memory='{}' instead.".format(dummy)
    with pytest.raises(ValueError, match=msg):
        check_memory(dummy)
def _get_trans_col(self, fm_keys):
    """Load and transform the DataFiles referenced by ``fm_keys``,
    optionally memory-cached and/or parallelized over chunks.

    Returns the list of transformed columns.
    """
    # Grab the right data files from the file mapping (casting to int!)
    try:
        data_files = [self.file_mapping[int(fm_key)]
                      for fm_key in fm_keys]
    # A NaN key fails the int() cast — surface a clearer error.
    except ValueError:
        raise ValueError(
            'NaN error trying to load DataFile, make sure no missing DataFiles!'
        )

    # Clone the base loader
    cloned_estimator = clone(self.estimator)

    # If a caching location is passed, create new load_and_trans_c func
    if self.cache_loc is not None:
        memory = check_memory(self.cache_loc)
        load_and_trans_c = memory.cache(load_and_trans)
    else:
        load_and_trans_c = load_and_trans

    if self._n_jobs == 1:
        X_trans_cols = get_trans_chunk(cloned_estimator,
                                       data_files, load_and_trans_c)
    else:
        # Parallel path: transform chunks, then flatten the results.
        chunks = self.get_chunks(data_files)
        X_trans_chunks =\
            Parallel(n_jobs=self._n_jobs)(
                delayed(get_trans_chunk)(
                    transformer=cloned_estimator,
                    data_files=chunk,
                    func=load_and_trans_c)
                for chunk in chunks)

        X_trans_cols = []
        for chunk in X_trans_chunks:
            X_trans_cols += chunk

    return X_trans_cols
def _fit(self, X, y=None, **fit_params):
    """Fit-and-blend every non-final step, tracking surviving row indexes.

    Each step may shrink the row set; ``indexes`` selects the rows of the
    original ``y`` still in play.  Returns ``(Xt, final_step_fit_params,
    indexes)``.
    """
    # shallow copy of steps - this should really be steps_
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_blend_one_cached = memory.cache(self._fit_blend_one)

    fit_params_steps = dict(
        (name, {}) for name, step in self.steps if step is not None)
    for pname, pval in six.iteritems(fit_params):
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    Xt = X
    # Start with every row selected.
    indexes = np.arange(X.shape[0])
    for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
        if transformer is None:
            pass
        else:
            if hasattr(memory, 'cachedir') and memory.cachedir is None:
                # we do not clone when caching is disabled to preserve
                # backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
            # Fit or load from cache the current transformer
            Xt, fitted_transformer, indexes = fit_blend_one_cached(
                cloned_transformer, Xt, y[indexes], None,
                **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator is None:
        return Xt, {}, indexes
    return Xt, fit_params_steps[self.steps[-1][0]], indexes
def transform(self, X, y=None):
    """Apply ``self._transform`` to ``(X, y)``, memoized via ``self.memory``."""
    cached_transform = check_memory(self.memory).cache(self._transform)
    return cached_transform(X, y)
def _fit(self, X, y=None, **fit_params_steps):
    """Fit every non-final step, caching via ``self.memory``.

    Dispatches on transformer kind: ``YTransformer`` receives/replaces
    ``y``; ``XAndYTransformer``/``XOrYTransformer`` may replace both
    ``X`` and ``y``.  Steps for which ``_skip_transform`` is true are
    kept unfitted.  Returns the transformed ``(X, y)``.
    """
    # shallow copy of steps - this should really be steps_
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(_fit_transform_one)

    for (step_idx, name, transformer) in self._iter(with_final=False,
                                                    filter_passthrough=False):
        if (transformer is None or transformer == 'passthrough'):
            with _print_elapsed_time('Pipeline',
                                     self._log_message(step_idx)):
                continue

        if hasattr(memory, 'location'):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        elif hasattr(memory, 'cachedir'):
            # joblib < 0.11
            if memory.cachedir is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        else:
            cloned_transformer = clone(transformer)

        if not self._skip_transform(cloned_transformer):
            # Dispatch on the transformer kind.
            if isinstance(cloned_transformer, YTransformer):
                y, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, y, X, None,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name])
            elif isinstance(cloned_transformer, XAndYTransformer) \
                    or isinstance(cloned_transformer, XOrYTransformer):
                X, y, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, X, y, None,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name])
            else:
                # Fit or load from cache the current transformer
                X, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, X, y, None,
                    message_clsname='Pipeline',
                    message=self._log_message(step_idx),
                    **fit_params_steps[name])
        else:
            # do nothing if it is not trainmode and the trainonly wrapper set (true)
            fitted_transformer = cloned_transformer
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    return X, y
def fit(self, X, y=None):
    """Fit the hierarchical clustering on the data

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data. Shape [n_samples, n_features], or [n_samples,
        n_samples] if affinity=='precomputed'.

    y : Ignored

    Returns
    -------
    self
    """
    if (self.pooling_func != 'deprecated' and
            not isinstance(self, AgglomerationTransform)):
        warnings.warn(
            'Agglomerative "pooling_func" parameter is not used.'
            ' It has been deprecated in version 0.20 and will be'
            'removed in 0.22', DeprecationWarning)
    X = check_array(X, ensure_min_samples=2, estimator=self)
    memory = check_memory(self.memory)

    if self.n_clusters is not None and self.n_clusters <= 0:
        raise ValueError("n_clusters should be an integer greater than 0."
                         " %s was provided." % str(self.n_clusters))

    # Exactly one stopping criterion must be given.
    if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
        raise ValueError("Exactly one of n_clusters and "
                         "distance_threshold has to be set, and the other "
                         "needs to be None.")

    if (self.distance_threshold is not None
            and not self.compute_full_tree):
        raise ValueError("compute_full_tree must be True if "
                         "distance_threshold is set.")

    if self.linkage == "ward" and self.affinity != "euclidean":
        raise ValueError("%s was provided as affinity. Ward can only "
                         "work with euclidean distances." %
                         (self.affinity, ))

    if self.linkage not in _TREE_BUILDERS:
        raise ValueError("Unknown linkage type %s. "
                         "Valid options are %s" % (self.linkage,
                                                   _TREE_BUILDERS.keys()))
    tree_builder = _TREE_BUILDERS[self.linkage]

    connectivity = self.connectivity
    if self.connectivity is not None:
        if callable(self.connectivity):
            connectivity = self.connectivity(X)
        connectivity = check_array(
            connectivity, accept_sparse=['csr', 'coo', 'lil'])

    n_samples = len(X)
    compute_full_tree = self.compute_full_tree
    if self.connectivity is None:
        compute_full_tree = True
    if compute_full_tree == 'auto':
        if self.distance_threshold is not None:
            compute_full_tree = True
        else:
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
    n_clusters = self.n_clusters
    if compute_full_tree:
        n_clusters = None

    # Construct the tree; ward's builder takes no linkage/affinity kwargs.
    kwargs = {}
    if self.linkage != 'ward':
        kwargs['linkage'] = self.linkage
        kwargs['affinity'] = self.affinity

    distance_threshold = self.distance_threshold

    return_distance = distance_threshold is not None
    out = memory.cache(tree_builder)(X, connectivity,
                                     n_clusters=n_clusters,
                                     return_distance=return_distance,
                                     **kwargs)
    (self.children_,
     self.n_connected_components_,
     self.n_leaves_,
     parents) = out[:4]

    if distance_threshold is not None:
        # Number of clusters = merges at/above the threshold, plus one.
        distances = out[-1]
        self.distances_ = distances
        self.n_clusters_ = np.count_nonzero(
            distances >= distance_threshold) + 1
    else:
        self.n_clusters_ = self.n_clusters

    # Cut the tree
    if compute_full_tree:
        self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                               self.n_leaves_)
    else:
        labels = _hierarchical.hc_get_heads(parents, copy=False)
        # copy to avoid holding a reference on the original array
        labels = np.copy(labels[:n_samples])
        # Reassign cluster numbers
        self.labels_ = np.searchsorted(np.unique(labels), labels)
    return self
def _fit(self, X, y=None, **fit_params_steps):
    """Fit every non-final step; transformers may return ``(X, y)`` or
    ``(X, y, conf_score)`` tuples instead of just ``X``.

    Returns ``(X, y, conf_score)`` where ``conf_score`` is None unless a
    step produced one.
    """
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)

    fit_transform_one_cached = memory.cache(skpipeline._fit_transform_one)

    conf_score = None
    for (step_idx, name, transformer) in self._iter(with_final=False,
                                                    filter_passthrough=False):
        if transformer is None or transformer == 'passthrough':
            with _print_elapsed_time('Pipeline',
                                     self._log_message(step_idx)):
                continue

        if hasattr(memory, 'location'):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        elif hasattr(memory, 'cachedir'):
            # joblib < 0.11
            if memory.cachedir is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        else:
            cloned_transformer = clone(transformer)

        # Fit or load from cache the current transformer
        if hasattr(cloned_transformer, "transform") or hasattr(
            cloned_transformer, "fit_transform"
        ):
            res, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, X, y, None,
                message_clsname='Pipeline',
                message=self._log_message(step_idx),
                **fit_params_steps[name]
            )

            # This ugly if/else can be removed if Transformers return
            # additional values (i.e. `conf_score`) in dict. Can be
            # appended to `fit_params_steps` dict.
            if type(res) == tuple:
                if len(res) == 3:
                    X, y, conf_score = res
                elif len(res) == 2:
                    X, y = res
            else:
                X = res

        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    return X, y, conf_score
def _fit(self, X, y=None, **fit_params):
    """Fit every non-final step, transforming or resampling (X, y) in turn.

    Parameters
    ----------
    X : input samples, passed through each step's ``fit_transform``.
    y : optional targets; replaced by steps exposing ``fit_resample``.
    **fit_params : keyword arguments of the form ``stepname__param``
        routed to the matching step.

    Returns
    -------
    (X, y, params) : transformed data, (possibly resampled) targets, and
        the fit parameters destined for the final estimator — an empty
        dict when the final estimator is ``'passthrough'``.

    Raises
    ------
    ValueError : if a fit parameter is not in ``stepname__param`` form.
    """
    # Shallow copy so fitted transformers can be written back per step.
    self.steps = list(self.steps)
    self._validate_steps()
    # Set up caching of the expensive fit_transform / fit_resample calls.
    memory = check_memory(self.memory)
    fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
    fit_resample_one_cached = memory.cache(_fit_resample_one)
    # Route "step__param" fit parameters to their owning step.
    fit_params_steps = {
        name: {} for name, step in self.steps if step is not None
    }
    for pname, pval in fit_params.items():
        if '__' not in pname:
            raise ValueError(
                "Pipeline.fit does not accept the {} parameter. "
                "You can pass parameters to specific steps of your "
                "pipeline using the stepname__parameter format, e.g. "
                "`Pipeline.fit(X, y, logisticregression__sample_weight"
                "=sample_weight)`.".format(pname))
        step, param = pname.split("__", 1)
        fit_params_steps[step][param] = pval
    for (step_idx, name, transformer) in self._iter(
            with_final=False,
            filter_passthrough=False,
            filter_resample=False):
        if (transformer is None or transformer == 'passthrough'):
            # Log elapsed time for the skipped step, then move on.
            with _print_elapsed_time('Pipeline',
                                     self._log_message(step_idx)):
                continue
        if hasattr(memory, "location"):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        elif hasattr(memory, "cachedir"):
            # joblib <= 0.11
            if memory.cachedir is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        else:
            # BUGFIX: a Memory object exposing neither attribute used to
            # leave `cloned_transformer` unbound (NameError); clone by
            # default, matching the sibling pipeline implementations.
            cloned_transformer = clone(transformer)
        # Fit or load from cache the current transformer.
        if hasattr(cloned_transformer, "transform") or hasattr(
                cloned_transformer, "fit_transform"):
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, X, y, None,
                message_clsname='Pipeline',
                message=self._log_message(step_idx),
                **fit_params_steps[name])
        elif hasattr(cloned_transformer, "fit_resample"):
            X, y, fitted_transformer = fit_resample_one_cached(
                cloned_transformer, X, y,
                message_clsname='Pipeline',
                message=self._log_message(step_idx),
                **fit_params_steps[name])
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator == "passthrough":
        return X, y, {}
    return X, y, fit_params_steps[self.steps[-1][0]]
def transform(self, X, y=None):
    """Transform ``(X, y)``, memoizing the result via ``self.memory``.

    The underlying ``self._transform`` call is wrapped in a joblib-style
    cache built from ``self.memory``, so repeated calls with the same
    arguments are loaded from disk instead of recomputed.
    """
    cached_transform = check_memory(self.memory).cache(self._transform)
    return cached_transform(X, y)
def _fit(self, X_train, y_train, X_valid=None, y_valid=None, X_test=None, y_test=None):
    """Fit all non-final steps, propagating train/valid/test splits through each.

    Each step's (cached) fit_transform receives every split plus
    ``self.resource_manager`` and returns a dict of transformed splits;
    the fitted transformer is written back into ``self.steps``.

    Returns a dict with keys ``X_train``, ``X_valid``, ``X_test`` and
    ``y_train`` holding the data after all non-final steps.
    """
    # shallow copy of steps - this should really be steps_
    self.steps = list(self.steps)
    self._validate_steps()
    # Setup the memory
    memory = check_memory(self.memory)
    fit_transform_one_cached = memory.cache(_fit_transform_one)
    for (step_idx, name, transformer) in self._iter(with_final=False,
                                                    filter_passthrough=False):
        if (transformer is None or transformer == 'passthrough'):
            # Nothing to fit for a disabled step.
            continue
        # Decide whether to clone, depending on the joblib Memory API in use.
        if hasattr(memory, 'location'):
            # joblib >= 0.12
            if memory.location is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        elif hasattr(memory, 'cachedir'):
            # joblib < 0.11
            if memory.cachedir is None:
                # we do not clone when caching is disabled to
                # preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)
        else:
            # Unknown Memory API: clone defensively.
            cloned_transformer = clone(transformer)
        # Fit or load from cache the current transformer.
        # NOTE(review): `result` is presumed to be a dict of splits keyed
        # "X_train"/"X_valid"/"X_test"/"y_train" — confirm against
        # `_fit_transform_one`, which is defined elsewhere.
        result, fitted_transformer = fit_transform_one_cached(
            cloned_transformer, X_train, y_train, X_valid, y_valid,
            X_test, y_test, self.resource_manager,
            message_clsname='Pipeline',
            message=self._log_message(step_idx))
        X_train = result["X_train"]
        # Optional splits: absent keys fall back to None for the next step.
        X_valid = result.get("X_valid")
        X_test = result.get("X_test")
        y_train = result.get("y_train")
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    return {
        "X_train": X_train,
        "X_valid": X_valid,
        "X_test": X_test,
        "y_train": y_train
    }
def _fit(self, X, y=None, **fit_params):
    """Fit every non-final step, transforming or resampling (X, y) in turn.

    Parameters
    ----------
    X : input samples, passed through each step's ``fit_transform``.
    y : optional targets; replaced by steps exposing ``fit_resample``.
    **fit_params : keyword arguments of the form ``stepname__param``
        routed to the matching step.

    Returns
    -------
    (X, y, params) : transformed data, (possibly resampled) targets, and
        the fit parameters destined for the final estimator — an empty
        dict when the final estimator is ``"passthrough"``.

    Raises
    ------
    ValueError : if a fit parameter is not in ``stepname__param`` form.
    """
    # Shallow copy so fitted transformers can be written back per step.
    self.steps = list(self.steps)
    self._validate_steps()
    # Set up caching of the expensive fit_transform / fit_resample calls.
    memory = check_memory(self.memory)
    fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
    fit_resample_one_cached = memory.cache(_fit_resample_one)
    # Route "step__param" fit parameters to their owning step.
    fit_params_steps = {name: {} for name, step in self.steps if step is not None}
    for pname, pval in fit_params.items():
        if "__" not in pname:
            raise ValueError(
                f"Pipeline.fit does not accept the {pname} parameter. "
                "You can pass parameters to specific steps of your "
                "pipeline using the stepname__parameter format, e.g. "
                "`Pipeline.fit(X, y, logisticregression__sample_weight"
                "=sample_weight)`."
            )
        step, param = pname.split("__", 1)
        fit_params_steps[step][param] = pval
    # Sentinel distinguishing "attribute absent" from "attribute is None".
    _unset = object()
    for (step_idx, name, transformer) in self._iter(
        with_final=False, filter_passthrough=False, filter_resample=False
    ):
        if transformer is None or transformer == "passthrough":
            # Log elapsed time for the skipped step, then move on.
            with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
                continue
        # BUGFIX: the previous try/except/finally raised NameError from the
        # `finally` block when the Memory object had neither `location`
        # (joblib >= 0.12) nor `cachedir` (joblib <= 0.11), and its
        # truthiness test treated a falsy-but-set location (e.g. "") as
        # caching-disabled.  Resolve the location explicitly and test
        # `is None`, matching the sibling pipeline implementations.
        location = getattr(memory, "location", _unset)
        if location is _unset:
            location = getattr(memory, "cachedir", _unset)
        if location is _unset:
            # Unknown Memory API: clone defensively.
            cloned_transformer = clone(transformer)
        elif location is None:
            # We do not clone when caching is disabled, to preserve
            # backward compatibility.
            cloned_transformer = transformer
        else:
            cloned_transformer = clone(transformer)
        # Fit or load from cache the current transformer.
        if hasattr(cloned_transformer, "transform") or hasattr(
            cloned_transformer, "fit_transform"
        ):
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
        elif hasattr(cloned_transformer, "fit_resample"):
            X, y, fitted_transformer = fit_resample_one_cached(
                cloned_transformer,
                X,
                y,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
        # Replace the transformer of the step with the fitted
        # transformer. This is necessary when loading the transformer
        # from the cache.
        self.steps[step_idx] = (name, fitted_transformer)
    if self._final_estimator == "passthrough":
        return X, y, {}
    return X, y, fit_params_steps[self.steps[-1][0]]