from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline


def middle_transformations(self, est, X, y):
    """Apply only the intermediate steps of ``est`` (everything between
    the first and last steps) to X."""
    if len(est.steps) > 2:
        # Build a throwaway pipeline from the middle steps; the dummy
        # classifier only serves as the required final estimator and is
        # never fitted by _pre_transform.
        tmp = Pipeline([(name, obj) for name, obj in est.steps[1:-1]] +
                       [("dummy", DummyClassifier())])
        transformed_data, fit_params = tmp._pre_transform(X, y)
        return transformed_data
    else:
        return X
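# A minimal usage sketch for the helper above, not part of the original
# file. It assumes the legacy scikit-learn (< 0.17) private
# Pipeline._pre_transform API that the helper itself relies on; the
# _Holder class is only a hypothetical stand-in for whatever object hosts
# the method.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


class _Holder(object):
    middle_transformations = middle_transformations


X = np.random.randn(20, 5)
y = np.random.randint(0, 2, 20)
est = Pipeline([('scale', StandardScaler()),
                ('reduce', PCA(n_components=2)),
                ('clf', LogisticRegression())])
# Fits and applies only the middle 'reduce' step to X:
X_mid = _Holder().middle_transformations(est, X, y)
print(X_mid.shape)  # (20, 2)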
from abc import ABCMeta
from collections import defaultdict

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

import autosklearn.pipeline.create_searchspace_util
from autosklearn.pipeline import components


class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, random_state=None):
        self.configuration = configuration

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or
            sparse) depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for
            formatting instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument with
            a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without
            specifying a classification algorithm first.
        """
        X, fit_params = self.pre_transform(X, y, fit_params=fit_params,
                                           init_params=init_params)
        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):
        # Save all transformation objects in a list to create a pipeline
        # object
        steps = []

        # Separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        # List of preprocessing steps (and their order)
        preprocessors_names = [preprocessor[0] for preprocessor in
                               self._get_pipeline()[:-1]]

        for preproc_name in preprocessors_names:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(
                        preproc_name + ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            if preproc_name in \
                    components.feature_preprocessing_components._preprocessors:
                _preprocessors = \
                    components.feature_preprocessing_components._preprocessors
            elif preproc_name in \
                    components.data_preprocessing_components._preprocessors:
                _preprocessors = \
                    components.data_preprocessing_components._preprocessors
            else:
                raise ValueError(preproc_name)

            preprocessor_object = _preprocessors[preproc_name](
                random_state=self.random_state, **preproc_params)

            # Duck typing: a component wrapping several choices exposes
            # get_components(); use the concrete choice it selected.
            if hasattr(preprocessor_object, 'get_components'):
                preprocessor_object = preprocessor_object.choice
            steps.append((preproc_name, preprocessor_object))

        # Extract estimator hyperparameters from the configuration object
        estimator_name = self._get_pipeline()[-1][0]
        estimator_object = self._get_pipeline()[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                                            **estimator_parameters)

        # Duck typing, see above
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice
        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {key.replace(":", "__"): value for key, value in
                          fit_params.items()}
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].iterative_fit(X, y, n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size : int or None, defaults to None
            batch_size controls whether the pipeline will be called on
            small chunks of the data. Useful when calling the predict
            method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values
        """
        # TODO check if fit() was called before...
        if batch_size is None:
            return self.pipeline_.predict(X)

        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        if self.num_targets == 1:
            y = np.zeros((X.shape[0],))
        else:
            y = np.zeros((X.shape[0], self.num_targets))

        # Copied and adapted from the scikit-learn GP code
        for k in range(max(1, int(np.ceil(float(X.shape[0]) /
                                          batch_size)))):
            batch_from = k * batch_size
            batch_to = min((k + 1) * batch_size, X.shape[0])
            y[batch_from:batch_to] = \
                self.predict(X[batch_from:batch_to], batch_size=None)

        return y

    @classmethod
    def get_hyperparameter_search_space(cls, include=None, exclude=None,
                                        dataset_properties=None):
        """Return the configuration space for the CASH problem.

        This method should be called by the method
        get_hyperparameter_search_space of a subclass. After the subclass
        assembles a list of available estimators and preprocessor
        components, _get_hyperparameter_search_space can be called to do
        the work of creating the actual
        HPOlibConfigSpace.configuration_space.ConfigurationSpace object.

        Parameters
        ----------
        estimator_name : str
            Name of the estimator hyperparameter which will be used in the
            configuration space. For a classification task, this would be
            'classifier'.

        estimator_components : dict {name: component}
            Dictionary with all estimator components to be included in the
            configuration space.

        preprocessor_components : dict {name: component}
            Dictionary with all preprocessor components to be included in
            the configuration space.

        always_active : list of str
            A list of components which will always be active in the
            pipeline. This is useful for components like imputation which
            have hyperparameters to be configured, but which do not have
            any parent.

        default_estimator : str
            Default value for the estimator hyperparameter.

        Returns
        -------
        cs : HPOlibConfigSpace.configuration_space.ConfigurationSpace
            The configuration space describing the AutoSklearnClassifier.
        """
        raise NotImplementedError()

    @classmethod
    def _get_hyperparameter_search_space(cls, cs, dataset_properties,
                                         exclude, include, pipeline):
        if include is None:
            include = {}

        keys = [pair[0] for pair in pipeline]
        for key in include:
            if key not in keys:
                raise ValueError('Invalid key in include: %s; should be one '
                                 'of %s' % (key, keys))

        if exclude is None:
            exclude = {}

        keys = [pair[0] for pair in pipeline]
        for key in exclude:
            if key not in keys:
                raise ValueError('Invalid key in exclude: %s; should be one '
                                 'of %s' % (key, keys))

        if 'sparse' not in dataset_properties:
            # This dataset is probably dense
            dataset_properties['sparse'] = False
        if 'signed' not in dataset_properties:
            # This dataset probably contains unsigned data
            dataset_properties['signed'] = False

        matches = autosklearn.pipeline.create_searchspace_util.get_match_array(
            pipeline, dataset_properties, include=include, exclude=exclude)

        # Now we have only legal combinations at this step of the pipeline
        # Simple sanity checks
        assert np.sum(matches) != 0, "No valid pipeline found."

        assert np.sum(matches) <= np.size(matches), \
            "'matches' is not binary; %s <= %d, %s" % \
            (str(np.sum(matches)), np.size(matches), str(matches.shape))

        # Iterate each dimension of the matches array (each step of the
        # pipeline) to see if we can add a hyperparameter for that step
        for node_idx, n_ in enumerate(pipeline):
            node_name, node = n_
            is_choice = hasattr(node, "get_available_components")

            # If the node isn't a choice, we can add it immediately because
            # it must be active (otherwise np.sum(matches) would be zero)
            if not is_choice:
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(dataset_properties))
            # If the node is a choice, we have to figure out which of its
            # choices are actually legal choices
            else:
                choices_list = autosklearn.pipeline.create_searchspace_util.\
                    find_active_choices(matches, node, node_idx,
                                        dataset_properties,
                                        include.get(node_name),
                                        exclude.get(node_name))
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(
                        dataset_properties, include=choices_list))

        # And now add forbidden parameter configurations according to
        # matches
        if np.sum(matches) < np.size(matches):
            cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
                conf_space=cs, pipeline=pipeline, matches=matches,
                dataset_properties=dataset_properties, include=include,
                exclude=exclude)

        return cs

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join(
            ['configuration={\n ',
             ',\n '.join(["'%s': %s" % (hp_name,
                                        repr(configuration[hp_name]))
                          for hp_name in sorted(configuration)]),
             '}'])

        return '%s(%s)' % (class_name, configuration_string)

    @classmethod
    def _get_pipeline(cls):
        # The base class has no pipeline of its own; subclasses must
        # override this to return [(name, component), ...] with the
        # estimator as the last entry.
        if cls == BasePipeline:
            return []
        raise NotImplementedError()

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
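# The 'method:param' convention that pre_transform() parses above, shown
# in isolation (runnable stand-alone, not part of the original file; the
# component names are illustrative only): keys of init_params are split
# at ':' and grouped per component.
from collections import defaultdict

init_params = {'one_hot_encoding:categorical_features': [0, 3],
               'classifier:n_jobs': 1}
init_params_per_method = defaultdict(dict)
for init_param, value in init_params.items():
    method, param = init_param.split(':')
    init_params_per_method[method][param] = value
assert init_params_per_method['classifier'] == {'n_jobs': 1}
assert init_params_per_method['one_hot_encoding'] == {
    'categorical_features': [0, 3]}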
from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV  # legacy (< 0.18) layout
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# _build_design_matrix, _check_cv and delay_timeseries are module-level
# helpers assumed to be defined elsewhere in this file.


class EncodingModel(object):
    def __init__(self, delays=None, est=None, scorer=None, preproc_y=True):
        """Fit a STRF model.

        Fit a receptive field using time lags and a custom estimator or
        pipeline. This implementation uses Ridge regression and
        scikit-learn. It creates time lags for the input matrix, then does
        cross validation to fit a STRF model.

        Parameters
        ----------
        delays : array, shape (n_delays,)
            The delays to include when creating time lags. The input array
            X will end up having shape (n_feats * n_delays, n_times)
        est : instance of sklearn estimator | Pipeline with estimator
            The estimator to use for fitting. This may be a pipeline, in
            which case the final estimator must create a `coef_` attribute
            after fitting. If an estimator is passed, it also must produce
            a `coef_` attribute after fitting. If estimator is type
            `GridSearchCV`, then a grid search will be performed on each
            CV iteration (using the cv object stored in GridSearchCV).
            Extra attributes will be generated (see `fit` documentation).
        scorer : function | None
            The scorer to use when evaluating on the held-out test set. It
            must accept two 1-d arrays as inputs (the true values first,
            and predicted values second), and output a scalar value. If
            None, it will be mean squared error.
        preproc_y : bool
            Whether to apply the preprocessing steps of the estimator used
            in fitting to the output variable (y) prior to model fitting.
        """
        self.delays = np.array([0]) if delays is None else delays
        self.n_delays = len(self.delays)
        self.est = Ridge() if est is None else est
        self.scorer = mean_squared_error if scorer is None else scorer
        self.preproc_y = preproc_y

    def fit(self, X, y, sfreq, times=None, tmin=None, tmax=None, cv=None,
            cv_params=None, feat_names=None, verbose=False):
        """Fit the model.

        Fits a receptive field model. Model results are stored as
        attributes.

        Parameters
        ----------
        X : array, shape (n_epochs, n_feats, n_times)
            The input data for the regression
        y : array, shape (n_epochs, n_times)
            The output data for the regression
        sfreq : float
            The sampling frequency for the time dimension
        times : array, shape (n_times,)
            The times corresponding to the final axis of x/y. Is used to
            specify subsets of time per trial (using tmin/tmax)
        tmin : float | array, shape (n_epochs,)
            The beginning time for each epoch. Optionally a different time
            for each epoch may be provided.
        tmax : float | array, shape (n_epochs,)
            The end time for each epoch. Optionally a different time for
            each epoch may be provided.
        cv : int | instance of (KFold, LabelShuffleSplit)
            The cross validation object to use for the outer loop
        feat_names : list of strings/ints/floats, shape (n_feats,) | None
            A list of values corresponding to input features. Useful for
            keeping track of the coefficients in the model after time
            lagging.
        verbose : bool
            If True, will display a progress bar over the CV splits
            remaining during fitting.

        Attributes
        ----------
        coefs_ : array, shape (n_features * n_lags,)
            The average coefficients across CV splits
        coefs_all_ : array, shape (n_cv, n_features * n_lags)
            The raw coefficients for each iteration of cross-validation.
        coef_names : array, shape (n_features * n_lags, 2)
            A list of coefficient names, useful for keeping track of time
            lags
        scores_ : array, shape (n_cv,)
            Prediction scores for each cross-validation split on the
            held-out test set. Scores are outputs of the `scorer`
            attribute function.
        best_estimators_ : list of estimators, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list
            of chosen estimators on each cv split.
        best_params_ : list of dicts, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list
            of chosen parameters on each cv split.
        """
        if feat_names is not None:
            if len(feat_names) != X.shape[1]:
                raise ValueError(
                    'feat_names and X.shape[1] must be the same size')
        if times is None:
            times = np.arange(X.shape[-1]) / float(sfreq)
        self.tmin = times[0] if tmin is None else tmin
        self.tmax = times[-1] if tmax is None else tmax
        self.times = times
        self.sfreq = sfreq

        # Delay X
        X, y, labels, names = _build_design_matrix(
            X, y, sfreq, self.times, self.delays, self.tmin, self.tmax,
            feat_names)

        self.feat_names = np.array(names)
        cv = _check_cv(X, labels, cv, cv_params)

        # Define names for input variables to keep track of time delays
        X_names = [(feat, delay)
                   for delay in self.delays for feat in self.feat_names]
        self.coef_names = np.array(X_names)

        # Build model instance
        if not isinstance(self.est, Pipeline):
            self.est = Pipeline([('est', self.est)])

        # Create model metadata that we'll add to the obj later
        model_data = dict(coefs_all_=[], scores_=[])
        if isinstance(self.est.steps[-1][-1], GridSearchCV):
            model_data.update(dict(best_estimators_=[], best_params_=[]))

        # Fit the model and collect model results
        if verbose is True:
            cv = tqdm(cv)
        for i, (tr, tt) in enumerate(cv):
            X_tr = X[:, tr].T
            X_tt = X[:, tt].T
            y_tr = y[tr]
            y_tt = y[tt]
            if self.preproc_y:
                y_tr, y_tt = [self.est._pre_transform(arr)[0]
                              for arr in [y_tr, y_tt]]
            self.est.fit(X_tr, y_tr)

            mod = deepcopy(self.est.steps[-1][-1])
            if isinstance(mod, GridSearchCV):
                # If it's a GridSearch, then add a "best_params" object.
                # Assume hyperparameter search.
                if mod.refit:
                    model_data['best_estimators_'].append(
                        mod.best_estimator_)
                    model_data['coefs_all_'].append(
                        mod.best_estimator_.coef_)
                    model_data['best_params_'].append(mod.best_params_)
            else:
                model_data['coefs_all_'].append(mod.coef_)

            # Make predictions on the held-out test set and score them
            scr = self.scorer(y_tt, self.est.predict(X_tt))
            model_data['scores_'].append(scr)

        for key, val in model_data.items():
            setattr(self, key, np.array(val))
        self.coefs_ = np.mean(self.coefs_all_, axis=0)
        self.cv = cv

    def predict(self, X):
        """Generate predictions using a fit receptive field model.

        This uses the `coefs_` attribute for predictions.
        """
        X_lag = delay_timeseries(X, self.sfreq, self.delays)
        Xt = self.est._pre_transform(X_lag.T)[0]
        return np.dot(Xt, self.coefs_)

    def coefs_as_series(self, agg=None):
        """Return the raw coefficients as a pandas series.

        Parameters
        ----------
        agg : None | function
            If agg is None, all coefs across CVs will be returned. If it
            is a function, it will be applied across CVs and the output
            will be shape (n_coefficients,).

        Returns
        -------
        sr : pandas Series, shape (n_coefficients,) | (n_cv * n_coefficients,)
            The coefficients as a pandas series object.
        """
        ix = pd.MultiIndex.from_tuples(self.coef_names,
                                       names=['feat', 'lag'])
        if agg is None:
            sr = []
            for icv, icoef in enumerate(self.coefs_all_):
                isr = pd.DataFrame(icoef[:, np.newaxis], index=ix)
                isr['cv'] = icv
                isr = isr.set_index('cv', append=True).squeeze()
                sr.append(isr)
            sr = pd.concat(sr, axis=0)
        else:
            coefs = agg(self.coefs_all_, axis=0)
            sr = pd.Series(coefs, index=ix)
        return sr

    def plot_coefficients(self, agg=None, ax=None, cmap=None,
                          interpolation='nearest', aspect='auto', **kwargs):
        """Plot the coefficients as a 2D heatmap.

        The plot will be shape (n_features, n_lags)
        """
        from matplotlib import pyplot as plt
        cmap = plt.cm.RdBu_r if cmap is None else cmap
        agg = np.mean if agg is None else agg
        if ax is None:
            f, ax = plt.subplots()

        df = self.coefs_as_series(agg=agg).unstack('lag')
        ax.imshow(df.values, cmap=cmap, interpolation=interpolation,
                  aspect=aspect, **kwargs)
        for lab in ax.get_xticklabels():
            lab.set_text(df.columns[int(lab.get_position()[0])])
        for lab in ax.get_yticklabels():
            lab.set_text(df.index[int(lab.get_position()[1])])
        ax.set_xlabel('Time delays (s)')
        ax.set_ylabel('Features')
        return ax
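# A usage sketch for EncodingModel, not part of the original file. It
# assumes the module-level helpers (_build_design_matrix, _check_cv,
# delay_timeseries) referenced above are available, and follows the
# shapes documented in fit(); the data and hyperparameters are made up.
import numpy as np
from sklearn.linear_model import Ridge

n_epochs, n_feats, n_times, sfreq = 10, 4, 200, 100.
X = np.random.randn(n_epochs, n_feats, n_times)
y = np.random.randn(n_epochs, n_times)
delays = np.arange(0., .1, 1. / sfreq)  # 100 ms of time lags

model = EncodingModel(delays=delays, est=Ridge(alpha=1.))
model.fit(X, y, sfreq, cv=5)
print(model.coefs_.shape)  # (n_feats * n_delays,)
print(model.scores_)       # one score per CV split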
from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV  # legacy (< 0.18) layout
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# _build_design_matrix, _check_cv and delay_timeseries are module-level
# helpers assumed to be defined elsewhere in this file.


class EncodingModel(object):
    def __init__(self, delays=None, est=None, scorer=None, preproc_y=True):
        """Fit a STRF model.

        Fit a receptive field using time lags and a custom estimator or
        pipeline. This implementation uses Ridge regression and
        scikit-learn. It creates time lags for the input matrix, then does
        cross validation to fit a STRF model.

        Parameters
        ----------
        delays : array, shape (n_delays,)
            The delays to include when creating time lags. The input array
            X will end up having shape (n_feats * n_delays, n_times)
        est : instance of sklearn estimator | Pipeline with estimator
            The estimator to use for fitting. This may be a pipeline, in
            which case the final estimator must create a `coef_` attribute
            after fitting. If an estimator is passed, it also must produce
            a `coef_` attribute after fitting. If estimator is type
            `GridSearchCV`, then a grid search will be performed on each
            CV iteration (using the cv object stored in GridSearchCV).
            Extra attributes will be generated (see `fit` documentation).
        scorer : function | None
            The scorer to use when evaluating on the held-out test set. It
            must accept two 1-d arrays as inputs (the true values first,
            and predicted values second), and output a scalar value. If
            None, it will be mean squared error.
        preproc_y : bool
            Whether to apply the preprocessing steps of the estimator used
            in fitting to the output variable (y) prior to model fitting.

        References
        ----------
        [1] Theunissen, F. E. et al. Estimating spatio-temporal receptive
            fields of auditory and visual neurons from their responses to
            natural stimuli. Network 12, 289–316 (2001).
        [2] Willmore, B. & Smyth, D. Methods for first-order kernel
            estimation: simple-cell receptive fields from responses to
            natural scenes. Network 14, 553–77 (2003).
        """
        self.delays = np.array([0]) if delays is None else delays
        self.n_delays = len(self.delays)
        self.est = Ridge() if est is None else est
        self.scorer = mean_squared_error if scorer is None else scorer
        self.preproc_y = preproc_y

    def fit(self, X, y, sfreq, times=None, tmin=None, tmax=None, cv=None,
            cv_params=None, feat_names=None, verbose=False):
        """Fit the model.

        Fits a receptive field model. Model results are stored as
        attributes.

        Parameters
        ----------
        X : array, shape (n_epochs, n_feats, n_times)
            The input data for the regression
        y : array, shape (n_epochs, n_times)
            The output data for the regression
        sfreq : float
            The sampling frequency for the time dimension
        times : array, shape (n_times,)
            The times corresponding to the final axis of x/y. Is used to
            specify subsets of time per trial (using tmin/tmax)
        tmin : float | array, shape (n_epochs,)
            The beginning time for each epoch. Optionally a different time
            for each epoch may be provided.
        tmax : float | array, shape (n_epochs,)
            The end time for each epoch. Optionally a different time for
            each epoch may be provided.
        cv : int | instance of (KFold, LabelShuffleSplit)
            The cross validation object to use for the outer loop
        feat_names : list of strings/ints/floats, shape (n_feats,) | None
            A list of values corresponding to input features. Useful for
            keeping track of the coefficients in the model after time
            lagging.
        verbose : bool
            If True, will display a progress bar over the CV splits
            remaining during fitting.

        Attributes
        ----------
        coefs_ : array, shape (n_features * n_lags,)
            The average coefficients across CV splits
        coefs_all_ : array, shape (n_cv, n_features * n_lags)
            The raw coefficients for each iteration of cross-validation.
        coef_names : array, shape (n_features * n_lags, 2)
            A list of coefficient names, useful for keeping track of time
            lags
        scores_ : array, shape (n_cv,)
            Prediction scores for each cross-validation split on the
            held-out test set. Scores are outputs of the `scorer`
            attribute function.
        best_estimators_ : list of estimators, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list
            of chosen estimators on each cv split.
        best_params_ : list of dicts, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list
            of chosen parameters on each cv split.
        """
        if feat_names is not None:
            if len(feat_names) != X.shape[1]:
                raise ValueError(
                    'feat_names and X.shape[1] must be the same size')
        if times is None:
            times = np.arange(X.shape[-1]) / float(sfreq)
        self.tmin = times[0] if tmin is None else tmin
        self.tmax = times[-1] if tmax is None else tmax
        self.times = times
        self.sfreq = sfreq

        # Delay X
        X, y, labels, names = _build_design_matrix(
            X, y, sfreq, self.times, self.delays, self.tmin, self.tmax,
            feat_names)

        self.feat_names = np.array(names)
        cv = _check_cv(X, labels, cv, cv_params)

        # Define names for input variables to keep track of time delays
        X_names = [(feat, delay)
                   for delay in self.delays for feat in self.feat_names]
        self.coef_names = np.array(X_names)

        # Build model instance
        if not isinstance(self.est, Pipeline):
            self.est = Pipeline([('est', self.est)])

        # Create model metadata that we'll add to the obj later
        model_data = dict(coefs_all_=[], scores_=[])
        if isinstance(self.est.steps[-1][-1], GridSearchCV):
            model_data.update(dict(best_estimators_=[], best_params_=[]))

        # Fit the model and collect model results
        if verbose is True:
            cv = tqdm(cv)
        for i, (tr, tt) in enumerate(cv):
            X_tr = X[:, tr].T
            X_tt = X[:, tt].T
            y_tr = y[tr, np.newaxis]
            y_tt = y[tt, np.newaxis]
            if self.preproc_y:
                y_tr, y_tt = [self.est._pre_transform(arr)[0]
                              for arr in [y_tr, y_tt]]
            self.est.fit(X_tr, y_tr)

            mod = deepcopy(self.est.steps[-1][-1])
            if isinstance(mod, GridSearchCV):
                # If it's a GridSearch, then add a "best_params" object.
                # Assume hyperparameter search.
                if mod.refit:
                    model_data['best_estimators_'].append(
                        mod.best_estimator_)
                    model_data['coefs_all_'].append(
                        mod.best_estimator_.coef_)
                    model_data['best_params_'].append(mod.best_params_)
            else:
                model_data['coefs_all_'].append(mod.coef_)

            # Make predictions on the held-out test set and score them
            scr = self.scorer(y_tt, self.est.predict(X_tt))
            model_data['scores_'].append(scr)

        for key, val in model_data.items():
            setattr(self, key, np.array(val))
        self.coefs_ = np.mean(self.coefs_all_, axis=0)
        self.cv = cv

    def predict(self, X):
        """Generate predictions using a fit receptive field model.

        This uses the `coefs_` attribute for predictions.
        """
        X_lag = delay_timeseries(X, self.sfreq, self.delays)
        Xt = self.est._pre_transform(X_lag.T)[0]
        return np.dot(Xt, self.coefs_)

    def coefs_as_series(self, agg=None):
        """Return the raw coefficients as a pandas series.

        Parameters
        ----------
        agg : None | function
            If agg is None, all coefs across CVs will be returned. If it
            is a function, it will be applied across CVs and the output
            will be shape (n_coefficients,).

        Returns
        -------
        sr : pandas Series, shape (n_coefficients,) | (n_cv * n_coefficients,)
            The coefficients as a pandas series object.
        """
        ix = pd.MultiIndex.from_tuples(self.coef_names,
                                       names=['feat', 'lag'])
        if agg is None:
            sr = []
            for icv, icoef in enumerate(self.coefs_all_):
                isr = pd.DataFrame(icoef[:, np.newaxis], index=ix)
                isr['cv'] = icv
                isr = isr.set_index('cv', append=True).squeeze()
                sr.append(isr)
            sr = pd.concat(sr, axis=0)
        else:
            coefs = agg(self.coefs_all_, axis=0)
            sr = pd.Series(coefs, index=ix)
        return sr

    def plot_coefficients(self, agg=None, ax=None, cmap=None,
                          interpolation='nearest', aspect='auto', **kwargs):
        """Plot the coefficients as a 2D heatmap.

        The plot will be shape (n_features, n_lags)
        """
        from matplotlib import pyplot as plt
        cmap = plt.cm.RdBu_r if cmap is None else cmap
        agg = np.mean if agg is None else agg
        if ax is None:
            f, ax = plt.subplots()

        df = self.coefs_as_series(agg=agg).unstack('lag')
        ax.imshow(df.values, cmap=cmap, interpolation=interpolation,
                  aspect=aspect, **kwargs)
        for lab in ax.get_xticklabels():
            lab.set_text(df.columns[int(lab.get_position()[0])])
        for lab in ax.get_yticklabels():
            lab.set_text(df.index[int(lab.get_position()[1])])
        ax.set_xlabel('Time delays (s)')
        ax.set_ylabel('Features')
        return ax
from abc import ABCMeta
from collections import defaultdict

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

import autosklearn.pipeline.create_searchspace_util


class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, random_state=None):
        self.configuration = configuration
        # dtype used by predict(); without it the astype() calls below
        # would fail
        self._output_dtype = np.float32

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or
            sparse) depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for
            formatting instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument with
            a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without
            specifying a classification algorithm first.
        """
        X, fit_params = self.pre_transform(X, y, fit_params=fit_params,
                                           init_params=init_params)
        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):
        # Save all transformation objects in a list to create a pipeline
        # object
        steps = []

        # Separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        # Instantiate preprocessor objects
        for preproc_name, preproc_class in self._get_pipeline()[:-1]:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(
                        preproc_name + ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            preprocessor_object = preproc_class(
                random_state=self.random_state, **preproc_params)

            # Duck typing: a component wrapping several choices exposes
            # get_components(); use the concrete choice it selected.
            if hasattr(preproc_class, 'get_components'):
                preprocessor_object = preprocessor_object.choice
            steps.append((preproc_name, preprocessor_object))

        # Extract estimator hyperparameters from the configuration object
        estimator_name = self._get_pipeline()[-1][0]
        estimator_object = self._get_pipeline()[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                                            **estimator_parameters)

        # Duck typing, see above
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice
        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {key.replace(":", "__"): value for key, value in
                          fit_params.items()}
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].iterative_fit(X, y, n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size : int or None, defaults to None
            batch_size controls whether the pipeline will be called on
            small chunks of the data. Useful when calling the predict
            method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values
        """
        # TODO check if fit() was called before...
        if batch_size is None:
            return self.pipeline_.predict(X).astype(self._output_dtype)

        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        if self.num_targets == 1:
            y = np.zeros((X.shape[0],), dtype=self._output_dtype)
        else:
            y = np.zeros((X.shape[0], self.num_targets),
                         dtype=self._output_dtype)

        # Copied and adapted from the scikit-learn GP code
        for k in range(max(1, int(np.ceil(float(X.shape[0]) /
                                          batch_size)))):
            batch_from = k * batch_size
            batch_to = min((k + 1) * batch_size, X.shape[0])
            y[batch_from:batch_to] = \
                self.predict(X[batch_from:batch_to], batch_size=None)

        return y

    @classmethod
    def get_hyperparameter_search_space(cls, include=None, exclude=None,
                                        dataset_properties=None):
        """Return the configuration space for the CASH problem.

        This method should be called by the method
        get_hyperparameter_search_space of a subclass. After the subclass
        assembles a list of available estimators and preprocessor
        components, _get_hyperparameter_search_space can be called to do
        the work of creating the actual
        HPOlibConfigSpace.configuration_space.ConfigurationSpace object.

        Parameters
        ----------
        estimator_name : str
            Name of the estimator hyperparameter which will be used in the
            configuration space. For a classification task, this would be
            'classifier'.

        estimator_components : dict {name: component}
            Dictionary with all estimator components to be included in the
            configuration space.

        preprocessor_components : dict {name: component}
            Dictionary with all preprocessor components to be included in
            the configuration space.

        always_active : list of str
            A list of components which will always be active in the
            pipeline. This is useful for components like imputation which
            have hyperparameters to be configured, but which do not have
            any parent.

        default_estimator : str
            Default value for the estimator hyperparameter.

        Returns
        -------
        cs : HPOlibConfigSpace.configuration_space.ConfigurationSpace
            The configuration space describing the AutoSklearnClassifier.
        """
        raise NotImplementedError()

    @classmethod
    def _get_hyperparameter_search_space(cls, cs, dataset_properties,
                                         exclude, include, pipeline):
        if include is None:
            include = {}

        keys = [pair[0] for pair in pipeline]
        for key in include:
            if key not in keys:
                raise ValueError('Invalid key in include: %s; should be one '
                                 'of %s' % (key, keys))

        if exclude is None:
            exclude = {}

        keys = [pair[0] for pair in pipeline]
        for key in exclude:
            if key not in keys:
                raise ValueError('Invalid key in exclude: %s; should be one '
                                 'of %s' % (key, keys))

        if 'sparse' not in dataset_properties:
            # This dataset is probably dense
            dataset_properties['sparse'] = False
        if 'signed' not in dataset_properties:
            # This dataset probably contains unsigned data
            dataset_properties['signed'] = False

        matches = autosklearn.pipeline.create_searchspace_util.get_match_array(
            pipeline, dataset_properties, include=include, exclude=exclude)

        # Now we have only legal combinations at this step of the pipeline
        # Simple sanity checks
        assert np.sum(matches) != 0, "No valid pipeline found."

        assert np.sum(matches) <= np.size(matches), \
            "'matches' is not binary; %s <= %d, %s" % \
            (str(np.sum(matches)), np.size(matches), str(matches.shape))

        # Iterate each dimension of the matches array (each step of the
        # pipeline) to see if we can add a hyperparameter for that step
        for node_idx, n_ in enumerate(pipeline):
            node_name, node = n_
            is_choice = hasattr(node, "get_available_components")

            # If the node isn't a choice, we can add it immediately because
            # it must be active (otherwise np.sum(matches) would be zero)
            if not is_choice:
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(dataset_properties))
            # If the node is a choice, we have to figure out which of its
            # choices are actually legal choices
            else:
                choices_list = autosklearn.pipeline.create_searchspace_util.\
                    find_active_choices(matches, node, node_idx,
                                        dataset_properties,
                                        include.get(node_name),
                                        exclude.get(node_name))
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(
                        dataset_properties, include=choices_list))

        # And now add forbidden parameter configurations according to
        # matches
        if np.sum(matches) < np.size(matches):
            cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
                conf_space=cs, pipeline=pipeline, matches=matches,
                dataset_properties=dataset_properties, include=include,
                exclude=exclude)

        return cs

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join(
            ['configuration={\n ',
             ',\n '.join(["'%s': %s" % (hp_name,
                                        repr(configuration[hp_name]))
                          for hp_name in sorted(configuration)]),
             '}'])

        return '%s(%s)' % (class_name, configuration_string)

    @classmethod
    def _get_pipeline(cls):
        # The base class has no pipeline of its own; subclasses must
        # override this to return [(name, component), ...] with the
        # estimator as the last entry.
        if cls == BasePipeline:
            return []
        raise NotImplementedError()

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
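# The batched-prediction scheme from predict() above, in isolation
# (runnable stand-alone, not part of the original file; a plain
# scikit-learn classifier stands in for the fitted pipeline): X is
# chunked along its rows and a preallocated output array is filled chunk
# by chunk, which bounds peak memory use.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.randn(1000, 5)
y = (X[:, 0] > 0).astype(int)
clf = LogisticRegression().fit(X, y)

batch_size = 128
out = np.zeros((X.shape[0],), dtype=np.float32)
for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))):
    batch_from = k * batch_size
    batch_to = min((k + 1) * batch_size, X.shape[0])
    out[batch_from:batch_to] = clf.predict(X[batch_from:batch_to])
assert np.array_equal(out, clf.predict(X).astype(np.float32))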
from abc import ABCMeta
from collections import defaultdict

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

# get_pipeline(task) is a module-level helper assumed to be defined
# elsewhere in this file; it returns the [(name, component), ...] steps
# for a given task.


class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, task, random_state=None):
        self.configuration = configuration
        self.task = task
        self._output_dtype = np.float32

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or
            sparse) depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for
            formatting instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument with
            a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without
            specifying a classification algorithm first.
        """
        if y.ndim > 2:
            raise ValueError("y must be 1d or 2d array")

        X, fit_params = self.pre_transform(X, y, fit_params=fit_params,
                                           init_params=init_params)

        if y.ndim == 1:
            self.num_targets = 1
        else:
            self.num_targets = y.shape[1]

        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):
        # Save all transformation objects in a list to create a pipeline
        # object
        steps = []

        # Separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        pipeline = get_pipeline(self.task)

        # Instantiate preprocessor objects
        for preproc_name, preproc_class in pipeline[:-1]:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(
                        preproc_name + ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            preprocessor_object = preproc_class(
                random_state=self.random_state, **preproc_params)

            # Duck typing: a component wrapping several choices exposes
            # get_components(); use the concrete choice it selected.
            if hasattr(preproc_class, 'get_components'):
                preprocessor_object = preprocessor_object.choice
            steps.append((preproc_name, preprocessor_object))

        # Extract estimator hyperparameters from the configuration object
        estimator_name = pipeline[-1][0]
        estimator_object = pipeline[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                                            **estimator_parameters)

        # Duck typing, see above
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice
        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {key.replace(":", "__"): value for key, value in
                          fit_params.items()}
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].iterative_fit(X, y, n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size : int or None, defaults to None
            batch_size controls whether the pipeline will be called on
            small chunks of the data. Useful when calling the predict
            method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values
        """
        assert hasattr(self, 'pipeline_'), ("fit() must be called before "
                                            "calling predict()")
        if batch_size is None:
            return self.pipeline_.predict(X).astype(self._output_dtype)

        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        if self.num_targets == 1:
            y = np.zeros((X.shape[0],), dtype=self._output_dtype)
        else:
            y = np.zeros((X.shape[0], self.num_targets),
                         dtype=self._output_dtype)

        # Copied and adapted from the scikit-learn GP code
        for k in range(max(1, int(np.ceil(float(X.shape[0]) /
                                          batch_size)))):
            batch_from = k * batch_size
            batch_to = min((k + 1) * batch_size, X.shape[0])
            y[batch_from:batch_to] = \
                self.predict(X[batch_from:batch_to], batch_size=None)

        return y

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join(
            ['configuration={\n ',
             ',\n '.join(["'%s': %s" % (hp_name,
                                        repr(configuration[hp_name]))
                          for hp_name in sorted(configuration)]),
             '}'])

        return '%s(%s)' % (class_name, configuration_string)

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
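# The target-shape handling this variant adds to fit(), in isolation
# (runnable stand-alone, not part of the original file): y may be 1d (one
# target) or 2d (one column per target), and anything higher-dimensional
# is rejected; num_targets then determines the shape of the preallocated
# array in the batched predict().
import numpy as np


def _num_targets(y):
    if y.ndim > 2:
        raise ValueError("y must be 1d or 2d array")
    return 1 if y.ndim == 1 else y.shape[1]


assert _num_targets(np.zeros(10)) == 1
assert _num_targets(np.zeros((10, 3))) == 3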