Code example #1
# Method excerpt; Pipeline._pre_transform is a private API of older
# scikit-learn releases that fit-transforms every step except the last.
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

def middle_transformations(self, est, X, y):
    """Fit-transform X through every step of est except the first and last."""
    if len(est.steps) > 2:
        tmp = Pipeline(list(est.steps[1:-1]) + [("dummy", DummyClassifier())])
        transformed_data, fit_params = tmp._pre_transform(X, y)
        return transformed_data
    else:
        return X
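
For context, here is a minimal self-contained sketch of the same idea using only the public scikit-learn API. `_pre_transform` above fit-transformed every step except the last, which is why the snippet appends a throwaway DummyClassifier; a transformer-only Pipeline with `fit_transform` achieves the same thing without the placeholder. All estimator choices below are illustrative:

from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=50, n_features=8, random_state=0)
est = Pipeline([("scale", StandardScaler()),
                ("pca", PCA(n_components=4)),
                ("clf", LogisticRegression())])

# Fit-transform only the "middle" steps (everything between the first
# step and the final estimator); cloning keeps est itself untouched.
middle = Pipeline([(name, clone(step)) for name, step in est.steps[1:-1]])
Xt = middle.fit_transform(X, y)
print(Xt.shape)  # (50, 4)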
Code example #2
File: base.py, Project: stokasto/auto-sklearn
class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, random_state=None):
        self.configuration = configuration

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or sparse)
            depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for formatting
            instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument by a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without specifying
            a classification algorithm first.
        """
        X, fit_params = self.pre_transform(X, y, fit_params=fit_params,
                                          init_params=init_params)
        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):

        # Save all transformation objects in a list to create a pipeline object
        steps = []

        # separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        # List of preprocessing steps (and their order)
        preprocessors_names = [preprocessor[0] for
                               preprocessor in self._get_pipeline()[:-1]]

        for preproc_name in preprocessors_names:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(
                        preproc_name + ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            if preproc_name in \
                    components.feature_preprocessing_components._preprocessors:
                _preprocessors = components.feature_preprocessing_components._preprocessors
            elif preproc_name in \
                    components.data_preprocessing_components._preprocessors:
                _preprocessors = components.data_preprocessing_components._preprocessors
            else:
                raise ValueError(preproc_name)

            preprocessor_object = _preprocessors[preproc_name](
                random_state=self.random_state, **preproc_params)

            # Ducktyping...
            if hasattr(preprocessor_object, 'get_components'):
                preprocessor_object = preprocessor_object.choice

            steps.append((preproc_name, preprocessor_object))

        # Extract Estimator Hyperparameters from the configuration object
        estimator_name = self._get_pipeline()[-1][0]
        estimator_object = self._get_pipeline()[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                            **estimator_parameters)

        # Ducktyping...
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice

        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {key.replace(":", "__"): value for key, value in
                          fit_params.items()}
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].iterative_fit(X, y, n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipeline will be
            called on small chunks of the data. Useful when calling the
            predict method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values"""
        # TODO check if fit() was called before...

        if batch_size is None:
            return self.pipeline_.predict(X)
        else:
            if type(batch_size) is not int or batch_size <= 0:
                raise Exception("batch_size must be a positive integer")

            else:
                if self.num_targets == 1:
                    y = np.zeros((X.shape[0],))
                else:
                    y = np.zeros((X.shape[0], self.num_targets))

                # Copied and adapted from the scikit-learn GP code
                for k in range(max(1, int(np.ceil(float(X.shape[0]) /
                                                  batch_size)))):
                    batch_from = k * batch_size
                    batch_to = min([(k + 1) * batch_size, X.shape[0]])
                    y[batch_from:batch_to] = \
                        self.predict(X[batch_from:batch_to], batch_size=None)

                return y

    @classmethod
    def get_hyperparameter_search_space(cls, include=None, exclude=None,
                                        dataset_properties=None):
        """Return the configuration space for the CASH problem.

        This method should be called by the method
        get_hyperparameter_search_space of a subclass. After the subclass
        assembles a list of available estimators and preprocessor components,
        _get_hyperparameter_search_space can be called to do the work of
        creating the actual
        HPOlibConfigSpace.configuration_space.ConfigurationSpace object.

        Parameters
        ----------
        estimator_name : str
            Name of the estimator hyperparameter which will be used in the
            configuration space. For a classification task, this would be
            'classifier'.

        estimator_components : dict {name: component}
            Dictionary with all estimator components to be included in the
            configuration space.

        preprocessor_components : dict {name: component}
            Dictionary with all preprocessor components to be included in the
            configuration space.

        always_active : list of str
            A list of components which will always be active in the pipeline.
            This is useful for components like imputation which have
            hyperparameters to be configured, but which do not have any parent.

        default_estimator : str
            Default value for the estimator hyperparameter.

        Returns
        -------
        cs : HPOlibConfigSpace.configuration_space.Configuration
            The configuration space describing the AutoSklearnClassifier.

        """
        raise NotImplementedError()

    @classmethod
    def _get_hyperparameter_search_space(cls, cs, dataset_properties, exclude,
                                         include, pipeline):
        if include is None:
            include = {}

        keys = [pair[0] for pair in pipeline]
        for key in include:
            if key not in keys:
                raise ValueError('Invalid key in include: %s; should be one '
                                 'of %s' % (key, keys))

        if exclude is None:
            exclude = {}

        keys = [pair[0] for pair in pipeline]
        for key in exclude:
            if key not in keys:
                raise ValueError('Invalid key in exclude: %s; should be one '
                                 'of %s' % (key, keys))

        if 'sparse' not in dataset_properties:
            # This dataset is probably dense
            dataset_properties['sparse'] = False
        if 'signed' not in dataset_properties:
            # This dataset probably contains unsigned data
            dataset_properties['signed'] = False

        matches = autosklearn.pipeline.create_searchspace_util.get_match_array(
            pipeline, dataset_properties, include=include, exclude=exclude)

        # Now we have only legal combinations at this step of the pipeline
        # Simple sanity checks
        assert np.sum(matches) != 0, "No valid pipeline found."

        assert np.sum(matches) <= np.size(matches), \
            "'matches' is not binary; %s <= %d, %s" % \
            (str(np.sum(matches)), np.size(matches), str(matches.shape))

        # Iterate each dimension of the matches array (each step of the
        # pipeline) to see if we can add a hyperparameter for that step
        for node_idx, n_ in enumerate(pipeline):
            node_name, node = n_
            is_choice = hasattr(node, "get_available_components")

            # If the node isn't a choice we can add it immediately because it
            # must be active (if it weren't, np.sum(matches) would be zero).
            if not is_choice:
                cs.add_configuration_space(node_name,
                    node.get_hyperparameter_search_space(dataset_properties))
            # If the node is a choice, we have to figure out which of its
            # choices are actually legal choices
            else:
                choices_list = autosklearn.pipeline.create_searchspace_util.\
                    find_active_choices(matches, node, node_idx,
                                        dataset_properties,
                                        include.get(node_name),
                                        exclude.get(node_name))
                cs.add_configuration_space(node_name,
                    node.get_hyperparameter_search_space(
                        dataset_properties, include=choices_list))
        # And now add forbidden parameter configurations
        # According to matches
        if np.sum(matches) < np.size(matches):
            cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
                conf_space=cs, pipeline=pipeline, matches=matches,
                dataset_properties=dataset_properties, include=include,
                exclude=exclude)

        return cs

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join(
            ['configuration={\n  ',
             ',\n  '.join(["'%s': %s" % (hp_name, repr(configuration[hp_name]))
                                         for hp_name in sorted(configuration)]),
             '}'])

        return '%s(%s)' % (class_name, configuration_string)

    @classmethod
    def _get_pipeline(cls):
        if cls == BasePipeline:
            return []
        raise NotImplementedError()

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
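
The ':' convention that fit's docstring describes for init_params is implemented by the defaultdict loop at the top of pre_transform. A standalone sketch of that key-splitting step, with hypothetical parameter names:

from collections import defaultdict

# Hypothetical init_params keys; the part before ':' names the pipeline
# step, the part after names the constructor argument.
init_params = {"one_hot_encoding:minimum_fraction": 0.01,
               "classifier:n_estimators": 100}

init_params_per_method = defaultdict(dict)
for init_param, value in init_params.items():
    method, param = init_param.split(":")
    init_params_per_method[method][param] = value

# {'one_hot_encoding': {'minimum_fraction': 0.01},
#  'classifier': {'n_estimators': 100}}
print(dict(init_params_per_method))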
Code example #3
File: strf.py, Project: kingjr/ecogtools
class EncodingModel(object):
    def __init__(self, delays=None, est=None, scorer=None, preproc_y=True):
        """Fit a STRF model.

        Fit a receptive field using time lags and a custom estimator or
        pipeline. This implementation uses Ridge regression and scikit-learn.
        It creates time lags for the input matrix, then does cross validation
        to fit a STRF model.

        Parameters
        ----------
        delays : array, shape (n_delays,)
            The delays to include when creating time lags. The input array X
            will end up having shape (n_feats * n_delays, n_times)
        est : instance of sklearn estimator | pipeline with estimator
            The estimator to use for fitting. This may be a pipeline, in which
            case the final estimator must create a `coef_` attribute after
            fitting. If an estimator is passed, it also must produce a `coef_`
            attribute after fitting. If estimator is type `GridSearchCV`, then
            a grid search will be performed on each CV iteration (using the cv
            object stored in GridSearchCV). Extra attributes will be generated.
            (see `fit` documentation)
        scorer : function | None
            The scorer to use when evaluating on the held-out test set.
            It must accept two 1-d arrays as inputs (the true values first,
            and predicted values second), and output a scalar value.
            If None, it will be mean squared error.
        preproc_y : bool
            Whether to apply the preprocessing steps of the estimator used in
            fitting to the target variable y prior to model fitting.
        """
        self.delays = np.array([0]) if delays is None else delays
        self.n_delays = len(self.delays)
        self.est = Ridge() if est is None else est
        self.scorer = mean_squared_error if scorer is None else scorer
        self.preproc_y = preproc_y

    def fit(self, X, y, sfreq, times=None, tmin=None, tmax=None, cv=None,
            preproc_y=False, cv_params=None, feat_names=None, verbose=False):
        """Fit the model.

        Fits a receptive field model. Model results are stored as attributes.

        Parameters
        ----------
        X : array, shape (n_epochs, n_feats, n_times)
            The input data for the regression
        y : array, shape (n_epochs, n_times,)
            The output data for the regression
        sfreq : float
            The sampling frequency for the time dimension
        times : array, shape (n_times,)
            The times corresponding to the final axis of X/y. Used to
            specify subsets of time per trial (using tmin/tmax)
        tmin : float | array, shape (n_epochs,)
            The beginning time for each epoch. Optionally a different time
            for each epoch may be provided.
        tmax : float | array, shape (n_epochs,)
            The end time for each epoch. Optionally a different time for each
            epoch may be provided.
        cv : int | instance of (KFold, LabelShuffleSplit)
            The cross validation object to use for the outer loop
        feat_names : list of strings/ints/floats, shape (n_feats,) | None
            A list of values corresponding to input features. Useful for
            keeping track of the coefficients in the model after time lagging.
        verbose : bool
            If True, will display a progress bar during fits for CVs remaining.

        Attributes
        ----------
        coefs_ : array, shape (n_features * n_lags,)
            The average coefficients across CV splits
        coefs_all_ : array, shape (n_cv, n_features * n_lags)
            The raw coefficients for each iteration of cross-validation.
        coef_names : array, shape (n_features * n_lags, 2)
            A list of coefficient names, useful for keeping track of time lags
        scores_ : array, shape (n_cv,)
            Prediction scores for each cross-validation split on the held-out
            test set. Scores are outputs of the `scorer` attribute function.
        best_estimators_ : list of estimators, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list of
            chosen estimators on each cv split.
        best_params_ : list of dicts, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list of
            chosen parameters on each cv split.
        """
        if feat_names is not None:
            if len(feat_names) != X.shape[1]:
                raise ValueError(
                    'feat_names and X.shape[1] must be the same size')
        if times is None:
            times = np.arange(X.shape[-1]) / float(sfreq)
        self.tmin = times[0] if tmin is None else tmin
        self.tmax = times[-1] if tmax is None else tmax
        self.times = times
        self.sfreq = sfreq

        # Delay X
        X, y, labels, names = _build_design_matrix(X, y, sfreq, self.times,
                                                   self.delays, self.tmin,
                                                   self.tmax, feat_names)
        self.feat_names = np.array(names)
        cv = _check_cv(X, labels, cv, cv_params)

        # Define names for input variables to keep track of time delays
        X_names = [(feat, delay)
                   for delay in self.delays for feat in self.feat_names]
        self.coef_names = np.array(X_names)

        # Build model instance
        if not isinstance(self.est, Pipeline):
            self.est = Pipeline([('est', self.est)])

        # Create model metadata that we'll add to the obj later
        model_data = dict(coefs_all_=[], scores_=[])
        if isinstance(self.est.steps[-1][-1], GridSearchCV):
            model_data.update(dict(best_estimators_=[], best_params_=[]))

        # Fit the model and collect model results
        if verbose is True:
            cv = tqdm(cv)
        for i, (tr, tt) in enumerate(cv):
            X_tr = X[:, tr].T
            X_tt = X[:, tt].T
            y_tr = y[tr]
            y_tt = y[tt]

            if self.preproc_y:
                y_tr, y_tt = [self.est._pre_transform(arr)[0]
                              for arr in (y_tr, y_tt)]
            self.est.fit(X_tr, y_tr)

            mod = deepcopy(self.est.steps[-1][-1])
            if isinstance(mod, GridSearchCV):
                # If it's a GridSearch, then add a "best_params" object
                # Assume hyperparameter search
                if mod.refit:
                    model_data['best_estimators_'].append(mod.best_estimator_)
                    model_data['coefs_all_'].append(mod.best_estimator_.coef_)
                model_data['best_params_'].append(mod.best_params_)
            else:
                model_data['coefs_all_'].append(mod.coef_)

            # Fit model + make predictions
            scr = self.scorer(y_tt, self.est.predict(X_tt))
            model_data['scores_'].append(scr)

        for key, val in model_data.items():
            setattr(self, key, np.array(val))
        self.coefs_ = np.mean(self.coefs_all_, axis=0)
        self.cv = cv

    def predict(self, X):
        """Generate predictions using a fit receptive field model.

        This uses the `coef_` attribute for predictions.
        """
        X_lag = delay_timeseries(X, self.sfreq, self.delays)

        Xt = self.est._pre_transform(X_lag.T)[0]
        return np.dot(Xt, self.coefs_)

    def coefs_as_series(self, agg=None):
        """Return the raw coefficients as a pandas series.

        Parameters
        ----------
        agg : None | function
            If agg is None, all coefs across CVs will be returned. If it
            is a function, it will be applied across CVs and the output
            will be shape (n_coefficients,).

        Returns
        -------
        sr : pandas Series, shape (n_coefficients,) | (n_cv * n_coefficients)
            The coefficients as a pandas series object.
        """
        ix = pd.MultiIndex.from_tuples(self.coef_names, names=['feat', 'lag'])
        if agg is None:
            sr = []
            for icv, icoef in enumerate(self.coefs_all_):
                isr = pd.DataFrame(icoef[:, np.newaxis], index=ix)
                isr['cv'] = icv
                isr = isr.set_index('cv', append=True).squeeze()
                sr.append(isr)
            sr = pd.concat(sr, axis=0)
        else:
            coefs = agg(self.coefs_all_, axis=0)
            sr = pd.Series(coefs, index=ix)
        return sr

    def plot_coefficients(self, agg=None, ax=None, cmap=None,
                          interpolation='nearest', aspect='auto', **kwargs):
        """Plot the coefficients as a 2D heatmap.

        The plot will be shape (n_features, n_lags)
        """
        from matplotlib import pyplot as plt
        cmap = plt.cm.RdBu_r if cmap is None else cmap
        agg = np.mean if agg is None else agg
        if ax is None:
            f, ax = plt.subplots()
        df = self.coefs_as_series(agg=agg).unstack('lag')
        im = ax.imshow(df.values, cmap=cmap, interpolation=interpolation,
                       aspect=aspect, **kwargs)

        for lab in ax.get_xticklabels():
            lab.set_text(df.columns[int(lab.get_position()[0])])

        for lab in ax.get_yticklabels():
            lab.set_text(df.index[int(lab.get_position()[1])])

        ax.set_xlabel('Time delays (s)')
        ax.set_ylabel('Features')
        return ax
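
predict relies on delay_timeseries to rebuild the lagged design matrix before applying the averaged coefficients. The helper itself is not shown here; the following numpy sketch implements the semantics the docstring describes (delays in seconds, output of shape (n_feats * n_delays, n_times)) and may differ from the real implementation in edge handling:

import numpy as np

def delay_timeseries_sketch(X, sfreq, delays):
    """Stack one copy of X per delay, shifted along the time axis.

    X has shape (n_feats, n_times); the result has shape
    (n_feats * n_delays, n_times), padding vacated samples with zeros.
    """
    n_feats, n_times = X.shape
    lagged = np.zeros((len(delays) * n_feats, n_times))
    for i, delay in enumerate(delays):
        shift = int(round(delay * sfreq))
        rows = slice(i * n_feats, (i + 1) * n_feats)
        if shift >= 0:
            lagged[rows, shift:] = X[:, :n_times - shift]
        else:
            lagged[rows, :shift] = X[:, -shift:]
    return lagged

X = np.arange(6, dtype=float).reshape(2, 3)
print(delay_timeseries_sketch(X, sfreq=1., delays=[0, 1]).shape)  # (4, 3)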
Code example #4
File: fit.py, Project: choldgraf/pySTRF
class EncodingModel(object):
    def __init__(self, delays=None, est=None, scorer=None, preproc_y=True):
        """Fit a STRF model.

        Fit a receptive field using time lags and a custom estimator or
        pipeline. This implementation uses Ridge regression and scikit-learn.
        It creates time lags for the input matrix, then does cross validation
        to fit a STRF model.

        Parameters
        ----------
        delays : array, shape (n_delays,)
            The delays to include when creating time lags. The input array X
            will end up having shape (n_feats * n_delays, n_times)
        est : instance of sklearn estimator | pipeline with estimator
            The estimator to use for fitting. This may be a pipeline, in which
            case the final estimator must create a `coef_` attribute after
            fitting. If an estimator is passed, it also must produce a `coef_`
            attribute after fitting. If estimator is type `GridSearchCV`, then
            a grid search will be performed on each CV iteration (using the cv
            object stored in GridSearchCV). Extra attributes will be generated.
            (see `fit` documentation)
        scorer : function | None
            The scorer to use when evaluating on the held-out test set.
            It must accept two 1-d arrays as inputs (the true values first,
            and predicted values second), and output a scalar value.
            If None, it will be mean squared error.
        preproc_y : bool
            Whether to apply the preprocessing steps of the estimator used in
            fitting to the target variable y prior to model fitting.

        References
        ----------
        [1] Theunissen, F. E. et al. Estimating spatio-temporal receptive
                fields of auditory and visual neurons from their responses to
                natural stimuli. Network 12, 289–316 (2001).
        [2] Willmore, B. & Smyth, D. Methods for first-order kernel estimation:
                simple-cell receptive fields from responses to natural scenes.
                Network 14, 553–77 (2003).
        """
        self.delays = np.array([0]) if delays is None else delays
        self.n_delays = len(self.delays)
        self.est = Ridge() if est is None else est
        self.scorer = mean_squared_error if scorer is None else scorer
        self.preproc_y = preproc_y

    def fit(self,
            X,
            y,
            sfreq,
            times=None,
            tmin=None,
            tmax=None,
            cv=None,
            cv_params=None,
            feat_names=None,
            verbose=False):
        """Fit the model.

        Fits a receptive field model. Model results are stored as attributes.

        Parameters
        ----------
        X : array, shape (n_epochs, n_feats, n_times)
            The input data for the regression
        y : array, shape (n_epochs, n_times,)
            The output data for the regression
        sfreq : float
            The sampling frequency for the time dimension
        times : array, shape (n_times,)
            The times corresponding to the final axis of X/y. Used to
            specify subsets of time per trial (using tmin/tmax)
        tmin : float | array, shape (n_epochs,)
            The beginning time for each epoch. Optionally a different time
            for each epoch may be provided.
        tmax : float | array, shape (n_epochs,)
            The end time for each epoch. Optionally a different time for each
            epoch may be provided.
        cv : int | instance of (KFold, LabelShuffleSplit)
            The cross validation object to use for the outer loop
        feat_names : list of strings/ints/floats, shape (n_feats,) | None
            A list of values corresponding to input features. Useful for
            keeping track of the coefficients in the model after time lagging.
        verbose : bool
            If True, will display a progress bar during fits for CVs remaining.

        Attributes
        ----------
        coefs_ : array, shape (n_features * n_lags,)
            The average coefficients across CV splits
        coefs_all_ : array, shape (n_cv, n_features * n_lags)
            The raw coefficients for each iteration of cross-validation.
        coef_names : array, shape (n_features * n_lags, 2)
            A list of coefficient names, useful for keeping track of time lags
        scores_ : array, shape (n_cv,)
            Prediction scores for each cross-validation split on the held-out
            test set. Scores are outputs of the `scorer` attribute function.
        best_estimators_ : list of estimators, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list of
            chosen estimators on each cv split.
        best_params_ : list of dicts, shape (n_cv,)
            If initial estimator is type `GridSearchCV`, this is the list of
            chosen parameters on each cv split.
        """
        if feat_names is not None:
            if len(feat_names) != X.shape[1]:
                raise ValueError(
                    'feat_names and X.shape[1] must be the same size')
        if times is None:
            times = np.arange(X.shape[-1]) / float(sfreq)
        self.tmin = times[0] if tmin is None else tmin
        self.tmax = times[-1] if tmax is None else tmax
        self.times = times
        self.sfreq = sfreq

        # Delay X
        X, y, labels, names = _build_design_matrix(X, y, sfreq, self.times,
                                                   self.delays, self.tmin,
                                                   self.tmax, feat_names)
        self.feat_names = np.array(names)
        cv = _check_cv(X, labels, cv, cv_params)

        # Define names for input variables to keep track of time delays
        X_names = [(feat, delay) for delay in self.delays
                   for feat in self.feat_names]
        self.coef_names = np.array(X_names)

        # Build model instance
        if not isinstance(self.est, Pipeline):
            self.est = Pipeline([('est', self.est)])

        # Create model metadata that we'll add to the obj later
        model_data = dict(coefs_all_=[], scores_=[])
        if isinstance(self.est.steps[-1][-1], GridSearchCV):
            model_data.update(dict(best_estimators_=[], best_params_=[]))

        # Fit the model and collect model results
        if verbose is True:
            cv = tqdm(cv)
        for i, (tr, tt) in enumerate(cv):
            X_tr = X[:, tr].T
            X_tt = X[:, tt].T
            y_tr = y[tr, np.newaxis]
            y_tt = y[tt, np.newaxis]

            if self.preproc_y:
                y_tr, y_tt = [
                    self.est._pre_transform(arr)[0] for arr in (y_tr, y_tt)
                ]
            self.est.fit(X_tr, y_tr)

            mod = deepcopy(self.est.steps[-1][-1])
            if isinstance(mod, GridSearchCV):
                # If it's a GridSearch, then add a "best_params" object
                # Assume hyperparameter search
                if mod.refit:
                    model_data['best_estimators_'].append(mod.best_estimator_)
                    model_data['coefs_all_'].append(mod.best_estimator_.coef_)
                model_data['best_params_'].append(mod.best_params_)
            else:
                model_data['coefs_all_'].append(mod.coef_)

            # Fit model + make predictions
            scr = self.scorer(y_tt, self.est.predict(X_tt))
            model_data['scores_'].append(scr)

        for key, val in model_data.items():
            setattr(self, key, np.array(val))
        self.coefs_ = np.mean(self.coefs_all_, axis=0)
        self.cv = cv

    def predict(self, X):
        """Generate predictions using a fit receptive field model.

        This uses the `coef_` attribute for predictions.
        """
        X_lag = delay_timeseries(X, self.sfreq, self.delays)

        Xt = self.est._pre_transform(X_lag.T)[0]
        return np.dot(Xt, self.coefs_)

    def coefs_as_series(self, agg=None):
        """Return the raw coefficients as a pandas series.

        Parameters
        ----------
        agg : None | function
            If agg is None, all coefs across CVs will be returned. If it
            is a function, it will be applied across CVs and the output
            will be shape (n_coefficients,).

        Returns
        -------
        sr : pandas Series, shape (n_coefficients,) | (n_cv * n_coefficients)
            The coefficients as a pandas series object.
        """
        ix = pd.MultiIndex.from_tuples(self.coef_names, names=['feat', 'lag'])
        if agg is None:
            sr = []
            for icv, icoef in enumerate(self.coefs_all_):
                isr = pd.DataFrame(icoef[:, np.newaxis], index=ix)
                isr['cv'] = icv
                isr = isr.set_index('cv', append=True).squeeze()
                sr.append(isr)
            sr = pd.concat(sr, axis=0)
        else:
            coefs = agg(self.coefs_all_, axis=0)
            sr = pd.Series(coefs, index=ix)
        return sr

    def plot_coefficients(self,
                          agg=None,
                          ax=None,
                          cmap=None,
                          interpolation='nearest',
                          aspect='auto',
                          **kwargs):
        """Plot the coefficients as a 2D heatmap.

        The plot will be shape (n_features, n_lags)
        """
        from matplotlib import pyplot as plt
        cmap = plt.cm.RdBu_r if cmap is None else cmap
        agg = np.mean if agg is None else agg
        if ax is None:
            f, ax = plt.subplots()
        df = self.coefs_as_series(agg=agg).unstack('lag')
        im = ax.imshow(df.values,
                       cmap=cmap,
                       interpolation=interpolation,
                       aspect=aspect,
                       **kwargs)

        for lab in ax.get_xticklabels():
            lab.set_text(df.columns[int(lab.get_position()[0])])

        for lab in ax.get_yticklabels():
            lab.set_text(df.index[int(lab.get_position()[1])])

        ax.set_xlabel('Time delays (s)')
        ax.set_ylabel('Features')
        return ax
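
The scorer contract stated in __init__ (two 1-d arrays, true values first, scalar out) means any callable of that shape can replace the default mean_squared_error. A possible correlation-based scorer, shown only as an illustration:

import numpy as np

def corr_scorer(y_true, y_pred):
    """Pearson correlation between true and predicted values.

    Matches the documented scorer contract: two 1-d arrays in
    (true first, predicted second), one scalar out.
    """
    return float(np.corrcoef(y_true, y_pred)[0, 1])

# model = EncodingModel(scorer=corr_scorer)  # hypothetical usage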
Code example #5
File: base.py, Project: wanjinchang/auto-sklearn
class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, random_state=None):
        self.configuration = configuration

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or sparse)
            depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for formatting
            instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument by a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without specifying
            a classification algorithm first.
        """
        X, fit_params = self.pre_transform(X,
                                           y,
                                           fit_params=fit_params,
                                           init_params=init_params)
        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):

        # Save all transformation objects in a list to create a pipeline object
        steps = []

        # separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        # Instantiate preprocessor objects
        for preproc_name, preproc_class in self._get_pipeline()[:-1]:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(preproc_name +
                                                              ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            preprocessor_object = preproc_class(random_state=self.random_state,
                                                **preproc_params)

            # Ducktyping...
            if hasattr(preproc_class, 'get_components'):
                preprocessor_object = preprocessor_object.choice

            steps.append((preproc_name, preprocessor_object))

        # Extract Estimator Hyperparameters from the configuration object
        estimator_name = self._get_pipeline()[-1][0]
        estimator_object = self._get_pipeline()[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                                            **estimator_parameters)

        # Ducktyping...
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice

        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {
                key.replace(":", "__"): value
                for key, value in fit_params.items()
            }
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].iterative_fit(X,
                                                   y,
                                                   n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipeline will be
            called on small chunks of the data. Useful when calling the
            predict method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values"""
        # TODO check if fit() was called before...

        if batch_size is None:
            return self.pipeline_.predict(X).astype(self._output_dtype)
        else:
            if type(batch_size) is not int or batch_size <= 0:
                raise Exception("batch_size must be a positive integer")

            else:
                if self.num_targets == 1:
                    y = np.zeros((X.shape[0], ), dtype=self._output_dtype)
                else:
                    y = np.zeros((X.shape[0], self.num_targets),
                                 dtype=self._output_dtype)

                # Copied and adapted from the scikit-learn GP code
                for k in range(
                        max(1, int(np.ceil(float(X.shape[0]) / batch_size)))):
                    batch_from = k * batch_size
                    batch_to = min([(k + 1) * batch_size, X.shape[0]])
                    y[batch_from:batch_to] = \
                        self.predict(X[batch_from:batch_to], batch_size=None)

                return y

    @classmethod
    def get_hyperparameter_search_space(cls,
                                        include=None,
                                        exclude=None,
                                        dataset_properties=None):
        """Return the configuration space for the CASH problem.

        This method should be called by the method
        get_hyperparameter_search_space of a subclass. After the subclass
        assembles a list of available estimators and preprocessor components,
        _get_hyperparameter_search_space can be called to do the work of
        creating the actual
        HPOlibConfigSpace.configuration_space.ConfigurationSpace object.

        Parameters
        ----------
        estimator_name : str
            Name of the estimator hyperparameter which will be used in the
            configuration space. For a classification task, this would be
            'classifier'.

        estimator_components : dict {name: component}
            Dictionary with all estimator components to be included in the
            configuration space.

        preprocessor_components : dict {name: component}
            Dictionary with all preprocessor components to be included in the
            configuration space.

        always_active : list of str
            A list of components which will always be active in the pipeline.
            This is useful for components like imputation which have
            hyperparameters to be configured, but which do not have any parent.

        default_estimator : str
            Default value for the estimator hyperparameter.

        Returns
        -------
        cs : HPOlibConfigSpace.configuration_space.Configuration
            The configuration space describing the AutoSklearnClassifier.

        """
        raise NotImplementedError()

    @classmethod
    def _get_hyperparameter_search_space(cls, cs, dataset_properties, exclude,
                                         include, pipeline):
        if include is None:
            include = {}

        keys = [pair[0] for pair in pipeline]
        for key in include:
            if key not in keys:
                raise ValueError('Invalid key in include: %s; should be one '
                                 'of %s' % (key, keys))

        if exclude is None:
            exclude = {}

        keys = [pair[0] for pair in pipeline]
        for key in exclude:
            if key not in keys:
                raise ValueError('Invalid key in exclude: %s; should be one '
                                 'of %s' % (key, keys))

        if 'sparse' not in dataset_properties:
            # This dataset is probably dense
            dataset_properties['sparse'] = False
        if 'signed' not in dataset_properties:
            # This dataset probably contains unsigned data
            dataset_properties['signed'] = False

        matches = autosklearn.pipeline.create_searchspace_util.get_match_array(
            pipeline, dataset_properties, include=include, exclude=exclude)

        # Now we have only legal combinations at this step of the pipeline
        # Simple sanity checks
        assert np.sum(matches) != 0, "No valid pipeline found."

        assert np.sum(matches) <= np.size(matches), \
            "'matches' is not binary; %s <= %d, %s" % \
            (str(np.sum(matches)), np.size(matches), str(matches.shape))

        # Iterate each dimension of the matches array (each step of the
        # pipeline) to see if we can add a hyperparameter for that step
        for node_idx, n_ in enumerate(pipeline):
            node_name, node = n_
            is_choice = hasattr(node, "get_available_components")

            # If the node isn't a choice we can add it immediately because it
            # must be active (if it weren't, np.sum(matches) would be zero).
            if not is_choice:
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(dataset_properties))
            # If the node is a choice, we have to figure out which of its
            # choices are actually legal choices
            else:
                choices_list = autosklearn.pipeline.create_searchspace_util.\
                    find_active_choices(matches, node, node_idx,
                                        dataset_properties,
                                        include.get(node_name),
                                        exclude.get(node_name))
                cs.add_configuration_space(
                    node_name,
                    node.get_hyperparameter_search_space(dataset_properties,
                                                         include=choices_list))
        # And now add forbidden parameter configurations
        # According to matches
        if np.sum(matches) < np.size(matches):
            cs = autosklearn.pipeline.create_searchspace_util.add_forbidden(
                conf_space=cs,
                pipeline=pipeline,
                matches=matches,
                dataset_properties=dataset_properties,
                include=include,
                exclude=exclude)

        return cs

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join([
            'configuration={\n  ', ',\n  '.join([
                "'%s': %s" % (hp_name, repr(configuration[hp_name]))
                for hp_name in sorted(configuration)
            ]), '}'
        ])

        return '%s(%s)' % (class_name, configuration_string)

    @classmethod
    def _get_pipeline(cls):
        if cls == BasePipeline:
            return []
        raise NotImplementedError()

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
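
The batch_size branch of predict is a generic chunked-prediction loop: slice X into batch_size rows at a time so that peak memory stays bounded. A self-contained restatement of that loop (all names are illustrative):

import numpy as np

def predict_in_batches(predict_fn, X, batch_size):
    """Call predict_fn on batch_size-sized slices of X and stitch results."""
    n_samples = X.shape[0]
    out = None
    for k in range(max(1, int(np.ceil(n_samples / float(batch_size))))):
        sl = slice(k * batch_size, min((k + 1) * batch_size, n_samples))
        pred = np.asarray(predict_fn(X[sl]))
        if out is None:  # allocate once the output shape is known
            out = np.zeros((n_samples,) + pred.shape[1:], dtype=pred.dtype)
        out[sl] = pred
    return out

print(predict_in_batches(lambda chunk: chunk.sum(axis=1),
                         np.ones((10, 3)), batch_size=4))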
Code example #6
File: base.py, Project: mayurmorin/AutoML-Challenge
class BasePipeline(BaseEstimator):
    """Base class for all pipeline objects.

    Notes
    -----
    This class should not be instantiated, only subclassed."""
    __metaclass__ = ABCMeta

    def __init__(self, configuration, task, random_state=None):
        self.configuration = configuration
        self.task = task
        self._output_dtype = np.float32

        if random_state is None:
            self.random_state = check_random_state(1)
        else:
            self.random_state = check_random_state(random_state)

    def fit(self, X, y, fit_params=None, init_params=None):
        """Fit the selected algorithm to the training data.

        Parameters
        ----------
        X : array-like or sparse, shape = (n_samples, n_features)
            Training data. The preferred type of the matrix (dense or sparse)
            depends on the estimator selected.

        y : array-like
            Targets

        fit_params : dict
            See the documentation of sklearn.pipeline.Pipeline for formatting
            instructions.

        init_params : dict
            Pass arguments to the constructors of single methods. To pass
            arguments to only one of the methods (let's say the
            OneHotEncoder), separate the class name from the argument by a ':'.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        NoModelException
            NoModelException is raised if fit() is called without specifying
            a classification algorithm first.
        """
        if y.ndim > 2:
            raise ValueError("y must be 1d or 2d array")

        X, fit_params = self.pre_transform(X,
                                           y,
                                           fit_params=fit_params,
                                           init_params=init_params)
        if y.ndim == 1:
            self.num_targets = 1
        else:
            self.num_targets = y.shape[1]
        self.fit_estimator(X, y, fit_params=fit_params)
        return self

    def pre_transform(self, X, y, fit_params=None, init_params=None):

        # Save all transformation objects in a list to create a pipeline object
        steps = []

        # separate the init parameters for the single methods
        init_params_per_method = defaultdict(dict)
        if init_params is not None and len(init_params) != 0:
            for init_param, value in init_params.items():
                method, param = init_param.split(":")
                init_params_per_method[method][param] = value

        pipeline = get_pipeline(self.task)

        # Instantiate preprocessor objects
        for preproc_name, preproc_class in pipeline[:-1]:
            preproc_params = {}
            for instantiated_hyperparameter in self.configuration:
                if not instantiated_hyperparameter.startswith(preproc_name +
                                                              ":"):
                    continue
                if self.configuration[instantiated_hyperparameter] is None:
                    continue

                name_ = instantiated_hyperparameter.split(":")[-1]
                preproc_params[name_] = self.configuration[
                    instantiated_hyperparameter]

            preprocessor_object = preproc_class(random_state=self.random_state,
                                                **preproc_params)

            # Ducktyping...
            if hasattr(preproc_class, 'get_components'):
                preprocessor_object = preprocessor_object.choice

            steps.append((preproc_name, preprocessor_object))

        # Extract Estimator Hyperparameters from the configuration object
        estimator_name = pipeline[-1][0]
        estimator_object = pipeline[-1][1]
        estimator_parameters = {}
        for instantiated_hyperparameter in self.configuration:
            if not instantiated_hyperparameter.startswith(estimator_name):
                continue
            if self.configuration[instantiated_hyperparameter] is None:
                continue

            name_ = instantiated_hyperparameter.split(":")[-1]
            estimator_parameters[name_] = self.configuration[
                instantiated_hyperparameter]

        estimator_parameters.update(init_params_per_method[estimator_name])
        estimator_object = estimator_object(random_state=self.random_state,
                                            **estimator_parameters)

        # Ducktyping...
        if hasattr(estimator_object, 'get_components'):
            estimator_object = estimator_object.choice

        steps.append((estimator_name, estimator_object))

        self.pipeline_ = Pipeline(steps)
        if fit_params is None or not isinstance(fit_params, dict):
            fit_params = dict()
        else:
            fit_params = {
                key.replace(":", "__"): value
                for key, value in fit_params.items()
            }
        X, fit_params = self.pipeline_._pre_transform(X, y, **fit_params)
        return X, fit_params

    def fit_estimator(self, X, y, fit_params=None):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}
        self.pipeline_.steps[-1][-1].fit(X, y, **fit_params)
        return self

    def iterative_fit(self, X, y, fit_params=None, n_iter=1):
        check_is_fitted(self, 'pipeline_')
        if fit_params is None:
            fit_params = {}

        self.pipeline_.steps[-1][-1].iterative_fit(X,
                                                   y,
                                                   n_iter=n_iter,
                                                   **fit_params)

    def estimator_supports_iterative_fit(self):
        return hasattr(self.pipeline_.steps[-1][-1], 'iterative_fit')

    def configuration_fully_fitted(self):
        check_is_fitted(self, 'pipeline_')
        return self.pipeline_.steps[-1][-1].configuration_fully_fitted()

    def predict(self, X, batch_size=None):
        """Predict the classes using the selected model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipeline will be
            called on small chunks of the data. Useful when calling the
            predict method on the whole array X results in a MemoryError.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Returns the predicted values"""

        assert hasattr(self, 'pipeline_'), "fit() must be called " \
                                           "before calling predict()"

        if batch_size is None:
            return self.pipeline_.predict(X).astype(self._output_dtype)
        else:
            if type(batch_size) is not int or batch_size <= 0:
                raise Exception("batch_size must be a positive integer")

            else:
                if self.num_targets == 1:
                    y = np.zeros((X.shape[0], ), dtype=self._output_dtype)
                else:
                    y = np.zeros((X.shape[0], self.num_targets),
                                 dtype=self._output_dtype)

                # Copied and adapted from the scikit-learn GP code
                for k in range(
                        max(1, int(np.ceil(float(X.shape[0]) / batch_size)))):
                    batch_from = k * batch_size
                    batch_to = min([(k + 1) * batch_size, X.shape[0]])
                    y[batch_from:batch_to] = \
                        self.predict(X[batch_from:batch_to], batch_size=None)

                return y

    def __repr__(self):
        class_name = self.__class__.__name__

        configuration = {}
        self.configuration._populate_values()
        for hp_name in self.configuration:
            if self.configuration[hp_name] is not None:
                configuration[hp_name] = self.configuration[hp_name]

        configuration_string = ''.join([
            'configuration={\n  ', ',\n  '.join([
                "'%s': %s" % (hp_name, repr(configuration[hp_name]))
                for hp_name in sorted(configuration)
            ]), '}'
        ])

        return '%s(%s)' % (class_name, configuration_string)

    def _get_estimator_hyperparameter_name(self):
        raise NotImplementedError()
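
Finally, note how pre_transform translates the ':'-separated fit_params keys into scikit-learn's double-underscore routing convention before calling the pipeline. A short sketch with a hypothetical key:

import numpy as np

fit_params = {"classifier:sample_weight": np.ones(100)}  # hypothetical key
fit_params = {key.replace(":", "__"): value
              for key, value in fit_params.items()}
# {'classifier__sample_weight': ...} -- the step__param form that
# sklearn.pipeline.Pipeline.fit expects.
print(list(fit_params))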