Example #1
class LARS(Classifier):
    """Least angle regression (LARS).

    LARS is the model selection algorithm from:

    Bradley Efron, Trevor Hastie, Iain Johnstone and Robert
    Tibshirani, Least Angle Regression, Annals of Statistics (with
    discussion) (2004) 32(2), 407-499. A new method for variable
    subset selection, with the lasso and 'epsilon' forward stagewise
    methods as special cases.

    Similar to SMLR, it performs a feature selection while performing
    classification, but instead of starting with all features, it
    starts with none and adds them in, which is similar to boosting.

    This learner behaves more like a ridge regression in that it
    returns prediction values and it treats the training labels as
    continuous.

    In the true nature of the PyMVPA framework, this algorithm is
    actually implemented in R by Trevor Hastie and wrapped via RPy.
    To make use of LARS, you must have R and RPy installed as well as
    the LARS contributed package. You can install the R and RPy with
    the following command on Debian-based machines:

    sudo aptitude install python-rpy python-rpy-doc r-base-dev

    You can then install the LARS package by running R as root and
    calling:

    install.packages()

    """

    # XXX from yoh: it is linear, isn't it?
    __tags__ = [
        'lars', 'regression', 'linear', 'has_sensitivity',
        'does_feature_selection', 'rpy2'
    ]

    def __init__(self,
                 model_type="lasso",
                 trace=False,
                 normalize=True,
                 intercept=True,
                 max_steps=None,
                 use_Gram=False,
                 **kwargs):
        """
        Initialize LARS.

        See the help in R for further details on the following parameters:

        Parameters
        ----------
        model_type : string
          Type of LARS to run. Can be one of ('lasso', 'lar',
          'forward.stagewise', 'stepwise').
        trace : boolean
          Whether to print progress in R as it works.
        normalize : boolean
          Whether to standardize each feature to unit L2 norm.
        intercept : boolean
          Whether to add a non-penalized intercept to the model.
        max_steps : None or int
          If not None, specifies the total number of iterations to run. Each
          iteration adds a feature; leaving it as None will keep adding
          features until convergence.
        use_Gram : boolean
          Whether to compute the Gram matrix (this should be False if you
          have more features than samples).
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        if model_type not in known_models:
            raise ValueError('Unknown model %s for LARS is specified. '
                             'Known models are %s' % (model_type,
                                                      repr(known_models)))

        # set up the params
        self.__type = model_type
        self.__normalize = normalize
        self.__intercept = intercept
        self.__trace = trace
        self.__max_steps = max_steps
        self.__use_Gram = use_Gram

        # pylint friendly initializations
        self.__lowest_Cp_step = None
        self.__weights = None
        """The beta weights for each feature."""
        self.__trained_model = None
        """The model object after training that will be used for
        predictions."""

    def __repr__(self):
        """String summary of the object
        """
        return "LARS(type='%s', normalize=%s, intercept=%s, trace=%s, " \
               "max_steps=%s, use_Gram=%s, " \
               "enable_ca=%s)" % \
               (self.__type,
                self.__normalize,
                self.__intercept,
                self.__trace,
                self.__max_steps,
                self.__use_Gram,
                str(self.ca.enabled))

    @due.dcite(Doi('10.1214/009053604000000067'),
               path="mvpa2.clfs.lars:LARS",
               description="Least angle regression",
               tags=["implementation"])
    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """
        targets = data.sa[self.get_space()].value[:, np.newaxis]
        # some non-Python friendly R-lars arguments
        lars_kwargs = {'use.Gram': self.__use_Gram}
        if self.__max_steps is not None:
            lars_kwargs['max.steps'] = self.__max_steps

        trained_model = r.lars(data.samples,
                               targets,
                               type=self.__type,
                               normalize=self.__normalize,
                               intercept=self.__intercept,
                               trace=self.__trace,
                               **lars_kwargs)
        #import pydb
        #pydb.debugger()
        # find the step with the lowest Cp (risk)
        # it is often the last step if you set a max_steps
        # must first convert dictionary to array
        Cp_vals = None
        try:
            Cp_vals = np.asanyarray(Rrx2(trained_model, 'Cp'))
        except TypeError as e:
            raise FailedToTrainError("Failed to train %s on %s. Got '%s' while trying to access " \
                  "trained model %s" % (self, data, e, trained_model))

        if Cp_vals is None:
            # if there weren't any -- just choose the 0th step
            lowest_Cp_step = 0
        elif np.isnan(Cp_vals[0]):
            # sometimes may come back nan, so just pick the last one
            lowest_Cp_step = len(Cp_vals) - 1
        else:
            # determine the lowest
            lowest_Cp_step = Cp_vals.argmin()

        self.__lowest_Cp_step = lowest_Cp_step
        # set the weights to the lowest Cp step
        self.__weights = np.asanyarray(Rrx2(trained_model,
                                            'beta'))[lowest_Cp_step]

        self.__trained_model = trained_model  # bind to an instance


#         # set the weights to the final state
#         self.__weights = self.__trained_model['beta'][-1,:]

    @accepts_dataset_as_samples
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        # predict with the final state (i.e., the last step)
        # predict with the lowest Cp step
        try:
            res = r.predict(self.__trained_model,
                            data,
                            mode='step',
                            s=self.__lowest_Cp_step)
            #s=self.__trained_model['beta'].shape[0])
            fit = np.atleast_1d(Rrx2(res, 'fit'))
        except RRuntimeError as e:
            raise FailedToPredictError("Failed to predict on %s using %s. Exception was: %s" \
                  % (data, self, e))

        self.ca.estimates = fit
        return fit

    def _init_internals(self):
        """Reinitialize all internals
        """
        self.__lowest_Cp_step = None
        self.__weights = None
        """The beta weights for each feature."""
        self.__trained_model = None
        """The model object after training that will be used for
        predictions."""

    def _untrain(self):
        super(LARS, self)._untrain()
        self._init_internals()

    ##REF: Name was automagically refactored
    def _get_feature_ids(self):
        """Return ids of the used features
        """
        return np.where(np.abs(self.__weights) > 0)[0]

    ##REF: Name was automagically refactored
    def get_sensitivity_analyzer(self, **kwargs):
        """Returns a sensitivity analyzer for LARS."""
        return LARSWeights(self, **kwargs)

    weights = property(lambda self: self.__weights)
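
A minimal usage sketch for this wrapper (not part of the class above; it assumes
that R, RPy and the R 'lars' package are installed, and `ds` is a hypothetical
PyMVPA Dataset whose targets can be treated as continuous):

import numpy as np

lars = LARS(model_type='lasso', max_steps=20)
lars.train(ds)                          # fits the R lars model via RPy
predictions = lars.predict(ds.samples)  # predictions at the lowest-Cp step
# features that kept a non-zero beta weight at that step
selected = np.where(np.abs(lars.weights) > 0)[0]
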
Example #2
class RFE(IterativeFeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection step the transfer error on
    some test dataset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.

    References
    ----------
    The strategy follows
      Guyon, I., Weston, J., Barnhill, S., & Vapnik, V. (2002). Gene
      selection for cancer classification using support vector
      machines. Mach. Learn., 46(1-3), 389--422.
    and was applied to SVM-based analysis of fMRI data in
      Hanson, S. J. & Halchenko, Y. O. (2008). Brain reading using
      full brain support vector machines for object recognition:
      there is no "face identification area". Neural Computation, 20,
      486--503.

    Examples
    --------

    There are multiple possible ways to design an RFE.  Here is one
    example which would rely on a SplitClassifier to extract
    sensitivities and provide estimate of performance (error)

    >>> # Lazy import
    >>> from mvpa2.suite import *
    >>> rfesvm_split = SplitClassifier(LinearCSVMC(), OddEvenPartitioner())
    >>> # design an RFE feature selection to be used with a classifier
    >>> rfe = RFE(rfesvm_split.get_sensitivity_analyzer(
    ...              # take sensitivities per each split, L2 norm, mean, abs them
    ...              postproc=ChainMapper([ FxMapper('features', l2_normed),
    ...                                     FxMapper('samples', np.mean),
    ...                                     FxMapper('samples', np.abs)])),
    ...           # use the error stored in the confusion matrix of split classifier
    ...           ConfusionBasedError(rfesvm_split, confusion_state='stats'),
    ...           # we just extract error from confusion, so no need to split dataset
    ...           Repeater(2),
    ...           # select 50% of the best on each step
    ...           fselector=FractionTailSelector(
    ...               0.50,
    ...               mode='select', tail='upper'),
    ...           # and stop whenever error didn't improve for up to 10 steps
    ...           stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
    ...           # we just extract it from existing confusion
    ...           train_pmeasure=False,
    ...           # but we do want to update sensitivities on each step
    ...           update_sensitivity=True)
    >>> clf = FeatureSelectionClassifier(
    ...           LinearCSVMC(),
    ...           # on features selected via RFE
    ...           rfe,
    ...           # custom description
    ...           descr='LinSVM+RFE(splits_avg)' )
    
    Note: If you rely on cross-validation for the StoppingCriterion, make sure
    that you have at least 3 chunks so that the SplitClassifier has at least
    2 chunks to split. Otherwise it cannot split any further (a single chunk
    cannot be split).

    """

    history = ConditionalAttribute(
        doc="Last step # when each feature was still present")
    sensitivities = ConditionalAttribute(
        enabled=False,
        doc="History of sensitivities (might consume too much memory")

    def __init__(self,
                 fmeasure,
                 pmeasure,
                 splitter,
                 fselector=FractionTailSelector(0.05),
                 update_sensitivity=True,
                 nfeatures_min=0,
                 **kwargs):
        # XXX Allow for multiple stopping criterions, e.g. error not decreasing
        # anymore OR number of features less than threshold
        """Initialize recursive feature elimination

        Parameters
        ----------
        fmeasure : FeaturewiseMeasure
        pmeasure : Measure
          used to compute the transfer error of a classifier based on a
          certain feature set on the test dataset.
          NOTE: If the sensitivity analyzer is based on the same
          classifier as the transfer error, make sure you
          initialize transfer_error with train=False, otherwise
          the classifier would be trained twice unnecessarily.
        splitter : Splitter
          This splitter instance has to generate at least two dataset splits
          when called with the input dataset. The first split serves as the
          training dataset and the second as the evaluation dataset.
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        update_sensitivity : bool
          If False the sensitivity map is only computed once and reused
          for each iteration. Otherwise the sensitivities are
          recomputed at each selection step.
        nfeatures_min : int
          Number of features for RFE to stop if reached.
        """
        # bases init first
        IterativeFeatureSelection.__init__(self, fmeasure, pmeasure, splitter,
                                           fselector, **kwargs)

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        self._nfeatures_min = nfeatures_min

    def __repr__(self, prefixes=None):
        if prefixes is None:
            prefixes = []
        return super(RFE, self).__repr__(
            prefixes=prefixes +
            _repr_attrs(self, ['update_sensitivity'], default=True))

    @due.dcite(BibTeX("""
@Article{ GWB+02,
    author = "I. Guyon and J. Weston and S. Barnhill and V. Vapnik",
    title = "Gene Selection for Cancer Classification using Support Vector Machines",
    volume = "46",
    year = "2002",
    pages = "389--422",
    publisher = "Kluwer",
    address = "Hingham, MA, USA",
    journal = "Machine Learning"
}"""),
               description="Recursive feature elimination procedure",
               tags=["implementation"])
    @due.dcite(Doi("10.1162/neco.2007.09-06-340"),
               description="Full-brain fMRI decoding using SVM RFE",
               tags=["use"])
    def _train(self, ds):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        ds : Dataset
          used to compute sensitivity maps and train a classifier
          to determine the transfer error
        """
        # get the initial split into train and test
        dataset, testdataset = self._get_traintest_ds(ds)

        if __debug__:
            debug('RFEC',
                  "Initiating RFE with training on %s and testing using %s",
                  (dataset, testdataset))
        errors = []
        """Computed error for each tested features set."""

        ca = self.ca
        ca.nfeatures = []
        """Number of features at each step. Since it is not used by the
        algorithm it is stored directly in the conditional attribute"""

        ca.history = np.arange(dataset.nfeatures)
        """Store the last step # when the feature was still present
        """

        ca.sensitivities = []

        stop = False
        """Flag when RFE should be stopped."""

        results = None
        """Will hold the best feature set ever."""

        wdataset = dataset
        """Operate on working dataset initially identical."""

        wtestdataset = testdataset
        """Same feature selection has to be performs on test dataset as well.
        This will hold the current testdataset."""

        step = 0
        """Counter how many selection step where done."""

        orig_feature_ids = np.arange(dataset.nfeatures)
        """List of feature Ids as per original dataset remaining at any given
        step"""

        sensitivity = None
        """Contains the latest sensitivity map."""

        result_selected_ids = orig_feature_ids
        """Resultant ids of selected features. Since the best is not
        necessarily is the last - we better keep this one around. By
        default -- all features are there"""
        selected_ids = result_selected_ids

        isthebest = True
        """By default (e.g. no errors even estimated) every step is the best one
        """

        while wdataset.nfeatures > 0:

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))

            # mark the features which are present at this step
            # if this brings any mentionable computational burden in the future,
            # only mark removed features at each step
            ca.history[orig_feature_ids] = step

            # Compute sensitivity map
            if self.__update_sensitivity or sensitivity is None:
                sensitivity = self._fmeasure(wdataset)
                if len(sensitivity) > 1:
                    raise ValueError(
                        "RFE cannot handle multiple sensitivities at once. "
                        "'%s' returned %i sensitivities." %
                        (self._fmeasure.__class__.__name__, len(sensitivity)))

            if ca.is_enabled("sensitivities"):
                ca.sensitivities.append(sensitivity)

            if self._pmeasure:
                # get error for current feature set (handles optional retraining)
                error = np.asscalar(
                    self._evaluate_pmeasure(wdataset, wtestdataset))
                # Record the error
                errors.append(error)

                # Check if it is time to stop and if we got
                # the best result
                if self._stopping_criterion is not None:
                    stop = self._stopping_criterion(errors)
                if self._bestdetector is not None:
                    isthebest = self._bestdetector(errors)
            else:
                error = None

            nfeatures = wdataset.nfeatures

            if ca.is_enabled("nfeatures"):
                ca.nfeatures.append(wdataset.nfeatures)

            # store result
            if isthebest:
                result_selected_ids = orig_feature_ids

            if __debug__:
                debug(
                    'RFEC', "Step %d: nfeatures=%d error=%s best/stop=%d/%d " %
                    (step, nfeatures, error, isthebest, stop))

            # stop if it is time to finish
            if nfeatures == 1 or nfeatures <= self.nfeatures_min or stop:
                break

            # Select features to preserve
            selected_ids = self._fselector(sensitivity)

            if __debug__:
                debug(
                    'RFEC_',
                    "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s"
                    % (sensitivity, len(selected_ids), selected_ids))

            # Create a dataset only with selected features
            wdataset = wdataset[:, selected_ids]

            # select corresponding sensitivity values if they are not
            # recomputed
            if not self.__update_sensitivity:
                if len(sensitivity.shape) >= 2:
                    assert sensitivity.shape[0] == 1  # there must be only 1 sample
                    sensitivity = sensitivity[:, selected_ids]
                else:
                    sensitivity = sensitivity[selected_ids]

            # need to update the test dataset as well
            # XXX why should it ever become None?
            # yoh: because we can have __transfer_error computed
            #      using wdataset. See xia-generalization estimate
            #      in lightsvm. Or for god's sake leave-one-out
            #      on a wdataset
            # TODO: document these cases in this class
            if testdataset is not None:
                wtestdataset = wtestdataset[:, selected_ids]

            step += 1

            # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
            selected_ids.sort()
            if self.ca.is_enabled("history") \
                   or self.ca.is_enabled('selected_ids'):
                orig_feature_ids = orig_feature_ids[selected_ids]

            # we already have the initial sensitivities, so even for a shared
            # classifier we can cleanup here
            if self._pmeasure:
                self._pmeasure.untrain()

        # charge conditional attributes
        self.ca.errors = errors
        self.ca.selected_ids = result_selected_ids
        if __debug__:
            debug('RFEC', "Selected %d features: %s",
                  (len(result_selected_ids), result_selected_ids))

        # announce desired features to the underlying slice mapper
        # do copy to survive later selections
        self._safe_assign_slicearg(copy(result_selected_ids))
        # call super to set _Xshape etc
        super(RFE, self)._train(dataset)

    def _untrain(self):
        super(RFE, self)._untrain()
        if self._pmeasure:
            self._pmeasure.untrain()
        if self._fmeasure:
            self._fmeasure.untrain()

    def _get_nfeatures_min(self):
        return self._nfeatures_min

    def _set_nfeatures_min(self, v):
        if self.is_trained:
            self.untrain()
        if v < 0:
            raise ValueError("nfeatures_min must not be negative. Got %s" % v)
        self._nfeatures_min = v

    nfeatures_min = property(fget=_get_nfeatures_min, fset=_set_nfeatures_min)
    update_sensitivity = property(fget=lambda self: self.__update_sensitivity)
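
A minimal follow-up sketch (hypothetical): the FeatureSelectionClassifier `clf`
constructed in the docstring example above can be cross-validated like any other
PyMVPA classifier; `ds` is assumed to be a Dataset with targets and at least 3
chunks (see the note in the docstring):

import numpy as np
from mvpa2.suite import CrossValidation, NFoldPartitioner, mean_mismatch_error

cv = CrossValidation(clf, NFoldPartitioner(), errorfx=mean_mismatch_error)
cv_results = cv(ds)         # one error estimate per fold
print(np.mean(cv_results))  # average cross-validated error
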
Example #3
class SMLR(Classifier):
    """Sparse Multinomial Logistic Regression `Classifier`.

    This is an implementation of the SMLR algorithm published in
    :ref:`Krishnapuram et al., 2005 <KCF+05>` (2005, IEEE Transactions
    on Pattern Analysis and Machine Intelligence).  Be sure to cite
    that article if you use this classifier for your work.
    """

    __tags__ = [
        'smlr', 'linear', 'has_sensitivity', 'binary', 'multiclass',
        'oneclass', 'does_feature_selection', 'random_tie_breaking'
    ]
    # XXX: later 'kernel-based'?

    lm = Parameter(
        .1,
        constraints=EnsureFloat() & EnsureRange(min=1e-10),
        doc="""The penalty term lambda.  Larger values will give rise to
             more sparsification.""")

    convergence_tol = Parameter(
        1e-3,
        constraints=EnsureFloat() & EnsureRange(min=1e-10, max=1.0),
        doc="""When the weight change for each cycle drops below this value
             the regression is considered converged.  Smaller values
             lead to tighter convergence.""")

    resamp_decay = Parameter(
        0.5,
        constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0),
        doc="""Decay rate in the probability of resampling a zero weight.
             A value of 1.0 will immediately decrease the probability to
             min_resamp, while 0.0 will keep it at 1.0 forever.""")

    min_resamp = Parameter(
        0.001,
        constraints=EnsureFloat() & EnsureRange(min=1e-10, max=1.0),
        doc="Minimum resampling probability for zeroed weights")

    maxiter = Parameter(
        10000,
        constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Maximum number of iterations before stopping if not
             converged.""")

    has_bias = Parameter(
        True,
        constraints='bool',
        doc="""Whether to add a bias term to allow fits to data not through
             zero""")

    fit_all_weights = Parameter(
        True,
        constraints='bool',
        doc="""Whether to fit weights for all classes or to the number of
            classes minus one.  Both should give nearly identical results, but
            if you set fit_all_weights to True it will take a little longer
            and yield weights that are fully analyzable for each class.  Also,
            note that the convergence rate may differ, but the convergence
            point is the same.""")

    implementation = Parameter(_DEFAULT_IMPLEMENTATION,
                               constraints=EnsureChoice('C', 'Python'),
                               doc="""Use C or Python as the implementation of
             stepwise_regression. The C version brings a significant speedup and
             is therefore the default.""")

    ties = Parameter('random',
                     constraints='str',
                     doc="""Resolve ties which could occur.  At the moment
                     only obvious ties resulting in identical weights
                     per two classes are detected and resolved
                     randomly by injecting small amount of noise into
                     the estimates of tied categories.
                     Set to False to avoid this behavior""")

    seed = Parameter(
        _random_seed,
        constraints=EnsureNone() | EnsureInt(),
        doc="""Seed to be used to initialize random generator, might be
             used to replicate the run""")

    unsparsify = Parameter(
        False,
        constraints='bool',
        doc="""***EXPERIMENTAL*** Whether to unsparsify the weights via
             regression. Note that it likely leads to worse classifier
             performance, but more interpretable weights.""")

    std_to_keep = Parameter(
        2.0,
        constraints='float',
        doc="""Standard deviation threshold of weights to keep when
             unsparsifying.""")

    def __init__(self, **kwargs):
        """Initialize an SMLR classifier.
        """
        """
        TODO:
         # Add in likelihood calculation
         # Add kernels, not just direct methods.
         """
        # init base class first
        Classifier.__init__(self, **kwargs)

        if _cStepwiseRegression is None and self.params.implementation == 'C':
            warning('SMLR: C implementation is not available.'
                    ' Using pure Python one')
            self.params.implementation = 'Python'

        # pylint friendly initializations
        self._ulabels = None
        """Unigue labels from the training set."""
        self.__weights_all = None
        """Contains all weights including bias values"""
        self.__weights = None
        """Just the weights, without the biases"""
        self.__biases = None
        """The biases, will remain none if has_bias is False"""

    ##REF: Name was automagically refactored
    def _python_stepwise_regression(self,
                                    w,
                                    X,
                                    XY,
                                    Xw,
                                    E,
                                    auto_corr,
                                    lambda_over_2_auto_corr,
                                    S,
                                    M,
                                    maxiter,
                                    convergence_tol,
                                    resamp_decay,
                                    min_resamp,
                                    verbose,
                                    seed=None):
        """The (much slower) python version of the stepwise
        regression.  I'm keeping this around for now so that we can
        compare results."""

        # get the data information into easy vars
        ns, nd = X.shape

        # initialize the iterative optimization
        converged = False
        incr = np.finfo(np.float).max
        non_zero, basis, m, wasted_basis, cycles = 0, 0, 0, 0, 0
        sum2_w_diff, sum2_w_old, w_diff = 0.0, 0.0, 0.0
        p_resamp = np.ones(w.shape, dtype=np.float)

        if seed is not None:
            # set the random seed
            np.random.seed(seed)

            if __debug__:
                debug("SMLR_", "random seed=%s" % seed)

        # perform the optimization
        while not converged and cycles < maxiter:
            # get the starting weight
            w_old = w[basis, m]

            # see if we're gonna update
            if (w_old != 0) or np.random.rand() < p_resamp[basis, m]:
                # let's do it
                # get the probability
                P = E[:, m] / S

                # set the gradient
                grad = XY[basis, m] - np.dot(X[:, basis], P)

                # calculate the new weight with the Laplacian prior
                w_new = w_old + grad / auto_corr[basis]

                # keep weights within bounds
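                # (soft-thresholding induced by the Laplacian/L1 prior:
                #  shrink |w_new| by lambda/(2*auto_corr), or zero it out)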
                if w_new > lambda_over_2_auto_corr[basis]:
                    w_new -= lambda_over_2_auto_corr[basis]
                    changed = True
                    # unmark from being zero if necessary
                    if w_old == 0:
                        non_zero += 1
                        # reset the prob of resampling
                        p_resamp[basis, m] = 1.0
                elif w_new < -lambda_over_2_auto_corr[basis]:
                    w_new += lambda_over_2_auto_corr[basis]
                    changed = True
                    # unmark from being zero if necessary
                    if w_old == 0:
                        non_zero += 1
                        # reset the prob of resampling
                        p_resamp[basis, m] = 1.0
                else:
                    # gonna zero it out
                    w_new = 0.0

                    # decrease the p_resamp
                    p_resamp[basis, m] -= (p_resamp[basis, m] - \
                                           min_resamp) * resamp_decay

                    # set number of non-zero
                    if w_old == 0:
                        changed = False
                        wasted_basis += 1
                    else:
                        changed = True
                        non_zero -= 1

                # process any changes
                if changed:
                    #print "w[%d, %d] = %g" % (basis, m, w_new)
                    # update the expected values
                    w_diff = w_new - w_old
                    Xw[:, m] = Xw[:, m] + X[:, basis] * w_diff
                    E_new_m = np.exp(Xw[:, m])
                    S += E_new_m - E[:, m]
                    E[:, m] = E_new_m

                    # update the weight
                    w[basis, m] = w_new

                    # keep track of the sqrt sum squared diffs
                    sum2_w_diff += w_diff * w_diff

                # add to the old no matter what
                sum2_w_old += w_old * w_old

            # update the class and basis
            m = np.mod(m + 1, w.shape[1])
            if m == 0:
                # we completed a cycle of labels
                basis = np.mod(basis + 1, nd)
                if basis == 0:
                    # we completed a cycle of features
                    cycles += 1

                    # assess convergence
                    incr = np.sqrt(sum2_w_diff) / \
                           (np.sqrt(sum2_w_old)+np.finfo(np.float).eps)

                    # save the new weights
                    converged = incr < convergence_tol

                    if __debug__:
                        debug("SMLR_", \
                              "cycle=%d ; incr=%g ; non_zero=%d ; " %
                              (cycles, incr, non_zero) +
                              "wasted_basis=%d ; " %
                              (wasted_basis) +
                              "sum2_w_old=%g ; sum2_w_diff=%g" %
                              (sum2_w_old, sum2_w_diff))

                    # reset the sum diffs and wasted_basis
                    sum2_w_diff = 0.0
                    sum2_w_old = 0.0
                    wasted_basis = 0

        if not converged:
            raise ConvergenceError("More than %d Iterations without convergence" % \
                (maxiter))

        # calculate the log likelihoods and posteriors for the training data
        #log_likelihood = x

        return cycles

    @due.dcite(Doi('10.1109/TPAMI.2005.127'),
               path="mvpa2.clfs.smlr:SMLR",
               description="Sparse multinomial-logistic regression classifier",
               tags=["implementation"])
    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        targets_sa_name = self.get_space()  # name of targets sa
        targets_sa = dataset.sa[targets_sa_name]  # actual targets sa

        # Process the labels to turn into 1 of N encoding
        uniquelabels = targets_sa.unique
        labels = _label2oneofm(targets_sa.value, uniquelabels)
        self._ulabels = uniquelabels.copy()

        Y = labels
        M = len(self._ulabels)

        # get the dataset information into easy vars
        X = dataset.samples

        # see if we are adding a bias term
        if self.params.has_bias:
            if __debug__:
                debug("SMLR_", "hstacking 1s for bias")

            # append the bias term to the features
            X = np.hstack((X, np.ones((X.shape[0], 1), dtype=X.dtype)))

        if self.params.implementation.upper() == 'C':
            _stepwise_regression = _cStepwiseRegression
            #
            # TODO: avoid copying to non-contig arrays, use strides in ctypes?
            if not (X.flags['C_CONTIGUOUS'] and X.flags['ALIGNED']):
                if __debug__:
                    debug("SMLR_",
                          "Copying data to get it C_CONTIGUOUS/ALIGNED")
                X = np.array(X, copy=True, dtype=np.double, order='C')

            # currently must be double for the C code
            if X.dtype != np.double:
                if __debug__:
                    debug("SMLR_", "Converting data to double")
                # must cast to double
                X = X.astype(np.double)

        elif self.params.implementation.upper() == 'PYTHON':
            _stepwise_regression = self._python_stepwise_regression
        else:
            raise ValueError(
                "Unknown implementation %s of stepwise_regression" %
                self.params.implementation)

        # set the feature dimensions
        ns, nd = X.shape

        # decide the size of weights based on num classes estimated
        if self.params.fit_all_weights:
            c_to_fit = M
        else:
            c_to_fit = M - 1

        # Precompute what we can
        auto_corr = ((M - 1.) / (2. * M)) * (np.sum(X * X, 0))
        XY = np.dot(X.T, Y[:, :c_to_fit])
        lambda_over_2_auto_corr = (self.params.lm / 2.) / auto_corr

        # set starting values
        w = np.zeros((nd, c_to_fit), dtype=np.double)
        Xw = np.zeros((ns, c_to_fit), dtype=np.double)
        E = np.ones((ns, c_to_fit), dtype=np.double)
        S = M * np.ones(ns, dtype=np.double)

        # set verbosity
        if __debug__:
            verbosity = int("SMLR_" in debug.active)
            debug('SMLR_',
                  'Calling stepwise_regression. Seed %s' % self.params.seed)
        else:
            verbosity = 0

        # call the chosen version of stepwise_regression
        cycles = _stepwise_regression(
            w, X, XY, Xw, E, auto_corr, lambda_over_2_auto_corr, S, M,
            self.params.maxiter, self.params.convergence_tol,
            self.params.resamp_decay, self.params.min_resamp, verbosity,
            self.params.seed)

        if cycles >= self.params.maxiter:
            # did not converge
            raise ConvergenceError(
                "More than %d iterations without convergence" %
                self.params.maxiter)

        # see if we should unsparsify the weights
        if self.params.unsparsify:
            # unsparsify
            w = self._unsparsify_weights(X, w)

        # resolve ties if present
        self.__ties = None
        if self.params.ties:
            if self.params.ties == 'random':
                # check if there is a tie, showing itself as an absent
                # difference between two w's
                wdot = np.dot(w.T, -w)
                ties = np.where(np.max(np.abs(wdot), axis=0) == 0)[0]
                if len(ties):
                    warning("SMLR: detected ties in categories %s.  Small "
                            "amount of noise will be injected into result "
                            "estimates upon prediction to break the ties" %
                            self._ulabels[ties])
                    self.__ties = ties
                    ## w_non0 = np.nonzero(w)
                    ## w_non0_max = np.max(np.abs(w[w_non0]))
                    ## w_non0_idx = np.unique(w_non0[0])
                    ## w_non0_len = len(w_non0_idx)
                    ## for f_idx in np.where(ties)[0]:
                    ##     w[w_non0_idx, f_idx] += \
                    ##          0.001 * np.random.normal(size=(w_non0_len,))
            else:
                raise ValueError("Do not know how to treat ties=%r" %
                                 (self.params.ties, ))

        # save the weights
        self.__weights_all = w
        self.__weights = w[:dataset.nfeatures, :]

        if self.ca.is_enabled('feature_ids'):
            self.ca.feature_ids = np.where(
                np.max(np.abs(w[:dataset.nfeatures, :]), axis=1) > 0)[0]

        # and a bias
        if self.params.has_bias:
            self.__biases = w[-1, :]

        if __debug__:
            debug(
                'SMLR', "train finished in %d cycles on data.shape=%s " %
                (cycles, X.shape) +
                "min:max(data)=%f:%f, got min:max(w)=%f:%f" %
                (np.min(X), np.max(X), np.min(w), np.max(w)))

    def _unsparsify_weights(self, samples, weights):
        """Unsparsify weights via least squares regression."""
        # allocate for the new weights
        new_weights = np.zeros(weights.shape, dtype=np.double)

        # get the sample data we're predicting and the sum squared
        # total variance
        b = samples
        sst = np.power(b - b.mean(0), 2).sum(0)

        # loop over each column
        for i in range(weights.shape[1]):
            w = weights[:, i]

            # get the nonzero ind
            ind = w != 0

            # get the features with non-zero weights
            a = b[:, ind]

            # predict all the data with the non-zero features
            betas = np.linalg.lstsq(a, b)[0]

            # determine the R^2 for each feature based on the sum
            # squared prediction error
            f = np.dot(a, betas)
            sse = np.power((b - f), 2).sum(0)
            rsquare = np.zeros(sse.shape, dtype=sse.dtype)
            gind = sst > 0
            rsquare[gind] = 1 - (sse[gind] / sst[gind])

            # derive new weights by combining the betas and weights
            # scaled by the rsquare
            new_weights[:, i] = np.dot(w[ind], betas) * rsquare

        # take the tails
        tozero = np.abs(
            new_weights) < self.params.std_to_keep * np.std(new_weights)
        orig_zero = weights == 0.0
        if orig_zero.sum() < tozero.sum():
            # should not end up with fewer than start
            tozero = orig_zero
        new_weights[tozero] = 0.0

        if __debug__:
            debug(
                'SMLR_', "Start nonzero: %d; Finish nonzero: %d" %
                ((weights != 0).sum(), (new_weights != 0).sum()))

        return new_weights

    ##REF: Name was automagically refactored
    def _get_feature_ids(self):
        """Return ids of the used features
        """
        return np.where(np.max(np.abs(self.__weights), axis=1) > 0)[0]

    @accepts_dataset_as_samples
    def _predict(self, data):
        """Predict the output for the provided data.
        """
        # see if we are adding a bias term
        if self.params.has_bias:
            # append the bias term to the features
            data = np.hstack(
                (data, np.ones((data.shape[0], 1), dtype=data.dtype)))

        # append the zeros column to the weights if necessary
        if self.params.fit_all_weights:
            w = self.__weights_all
        else:
            w = np.hstack(
                (self.__weights_all, np.zeros(
                    (self.__weights_all.shape[0], 1))))

        # determine the probability values for making the prediction
        dot_prod = np.dot(data, w)
        E = np.exp(dot_prod)
        if self.__ties is not None:
            # 1e-5 should be adequate since this is done
            # after exponentiation anyway
            E[:, self.__ties] += \
                 1e-5 * np.random.normal(size=(len(E), len(self.__ties)))
        S = np.sum(E, 1)

        if __debug__:
            debug(
                'SMLR', "predict on data.shape=%s min:max(data)=%f:%f " %
                (repr(data.shape), np.min(data), np.max(data)) +
                "min:max(w)=%f:%f min:max(dot_prod)=%f:%f min:max(E)=%f:%f" %
                (np.min(w), np.max(w), np.min(dot_prod), np.max(dot_prod),
                 np.min(E), np.max(E)))
        values = E / S[:, np.newaxis]  #.repeat(E.shape[1], axis=1)
        self.ca.estimates = values

        # generate predictions
        predictions = np.asarray(
            [self._ulabels[np.argmax(vals)] for vals in values])
        # no need to assign conditional attribute here -- would be done
        # in Classifier._postpredict anyway
        #self.predictions = predictions

        return predictions

    ##REF: Name was automagically refactored
    def get_sensitivity_analyzer(self, **kwargs):
        """Returns a sensitivity analyzer for SMLR."""
        return SMLRWeights(self, **kwargs)

    biases = property(lambda self: self.__biases)
    weights = property(lambda self: self.__weights)
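
A minimal usage sketch (hypothetical names: `ds_train` and `ds_test` are assumed
to be PyMVPA Datasets with a `targets` sample attribute):

import numpy as np

clf = SMLR(lm=0.1, fit_all_weights=True)
clf.train(ds_train)
predictions = clf.predict(ds_test.samples)
# sparse per-class weights; features with any non-zero weight were "selected"
used_features = np.where(np.max(np.abs(clf.weights), axis=1) > 0)[0]
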
Example #4
class BaseSearchlight(Measure):
    """Base class for searchlights.

    The idea for a searchlight algorithm stems from a paper by
    :ref:`Kriegeskorte et al. (2006) <KGB06>`.
    """

    roi_sizes = ConditionalAttribute(enabled=False,
                                     doc="Number of features in each ROI.")

    roi_feature_ids = ConditionalAttribute(
        enabled=False, doc="Feature IDs for all generated ROIs.")

    roi_center_ids = ConditionalAttribute(
        enabled=True, doc="Center ID for all generated ROIs.")

    is_trained = True
    """Indicate that this measure is always trained."""
    def __init__(self, queryengine, roi_ids=None, nproc=None, **kwargs):
        """
        Parameters
        ----------
        queryengine : QueryEngine
          Engine to use to discover the "neighborhood" of each feature.
          See :class:`~mvpa2.misc.neighborhood.QueryEngine`.
        roi_ids : None or list(int) or str
          List of query engine ids (e.g.,  feature ids, not coordinates, in case
          of `IndexQueryEngine`; and `node_indices` in case of
          `SurfaceQueryEngine`) that shall serve as ROI seeds
          (e.g., sphere centers). Alternatively, this can be the name of a
          feature attribute of the input dataset, whose non-zero values
          determine the feature ids (be careful to use it only with
          `IndexQueryEngine`).  By default all query engine ids will be used.
        nproc : None or int
          How many processes to use for computation.  Requires `pprocess`
          external module.  If None -- all available cores will be used.
        **kwargs
          In addition this class supports all keyword arguments of its
          base-class :class:`~mvpa2.measures.base.Measure`.
      """
        Measure.__init__(self, **kwargs)

        if nproc is not None and nproc > 1 and not externals.exists(
                'pprocess'):
            raise RuntimeError("The 'pprocess' module is required for "
                               "multiprocess searchlights. Please either "
                               "install python-pprocess, or reduce `nproc` "
                               "to 1 (got nproc=%i) or set to default None" %
                               nproc)

        self._queryengine = queryengine
        if roi_ids is not None and not isinstance(roi_ids, str) \
                and not len(roi_ids):
            raise ValueError(
                "Cannot run searchlight on an empty list of roi_ids")
        self.__roi_ids = roi_ids
        self.nproc = nproc

    def __repr__(self, prefixes=None):
        """String representation of a `Measure`

        Includes only arguments which differ from default ones
        """
        if prefixes is None:
            prefixes = []
        return super(BaseSearchlight, self).__repr__(
            prefixes=prefixes +
            _repr_attrs(self, ['queryengine', 'roi_ids', 'nproc']))

    @due.dcite(Doi('10.1073/pnas.0600244103'),
               description="Searchlight analysis approach",
               tags=["implementation"])
    @due.dcite(
        Doi('10.1038/nrn1931'),
        description=
        "Application of the searchlight approach to decoding using classifiers",
        tags=["use"])
    def _call(self, dataset):
        """Perform the ROI search.
        """
        # local binding
        nproc = self.nproc

        if nproc is None and externals.exists('pprocess'):
            import pprocess
            if on_osx:
                warning("Unable to determine automatically maximal number of "
                        "cores on Mac OS X. Using 1")
                nproc = 1
            else:
                try:
                    nproc = pprocess.get_number_of_cores() or 1
                except AttributeError:
                    warning(
                        "pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1" %
                        externals.versions['pprocess'])
                    nproc = 1
        # train the queryengine
        self._queryengine.train(dataset)

        # decide whether to run on all possible center coords or just a provided
        # subset
        if isinstance(self.__roi_ids, str):
            roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
        elif self.__roi_ids is not None:
            roi_ids = self.__roi_ids
            # safeguard against stupidity
            if __debug__:
                qe_ids = self._queryengine.ids  # known to qe
                if not set(qe_ids).issuperset(roi_ids):
                    raise IndexError(
                        "Some roi_ids are not known to the query engine %s: %s"
                        % (self._queryengine, set(roi_ids).difference(qe_ids)))
        else:
            roi_ids = self._queryengine.ids

        # pass to subclass
        results = self._sl_call(dataset, roi_ids, nproc)

        if 'mapper' in dataset.a:
            # since we know the space we can stick the original mapper into the
            # results as well
            if self.__roi_ids is None:
                results.a['mapper'] = copy.copy(dataset.a.mapper)
            else:
                # there is an additional selection step that needs to be
                # expressed by another mapper
                mapper = copy.copy(dataset.a.mapper)

                # NNO if the original mapper has no append (because it's not a
                # chainmapper, for example), we make our own chainmapper.
                #
                # The original code was:
                # mapper.append(StaticFeatureSelection(roi_ids,
                #                                     dshape=dataset.shape[1:]))
                feat_sel_mapper = StaticFeatureSelection(
                    roi_ids, dshape=dataset.shape[1:])
                if 'append' in dir(mapper):
                    mapper.append(feat_sel_mapper)
                else:
                    mapper = ChainMapper([dataset.a.mapper, feat_sel_mapper])

                results.a['mapper'] = mapper

        # charge state
        self.ca.raw_results = results
        # return raw results, base-class will take care of transformations
        return results

    def _sl_call(self, dataset, roi_ids, nproc):
        """Classical generic searchlight implementation
        """
        raise NotImplementedError("Must be implemented in the derived classes")

    queryengine = property(fget=lambda self: self._queryengine)
    roi_ids = property(fget=lambda self: self.__roi_ids)
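
BaseSearchlight itself is abstract (`_sl_call` must be provided by a subclass).
A minimal usage sketch, assuming the `sphere_searchlight` convenience wrapper
from PyMVPA is available, `cv` is a cross-validation measure, and `ds` is a
Dataset with `fa.voxel_indices` so the default sphere neighborhood applies:

from mvpa2.measures.searchlight import sphere_searchlight

sl = sphere_searchlight(cv, radius=3, nproc=2,
                        enable_ca=['roi_sizes'])
sl_map = sl(ds)              # one result per searchlight center
roi_sizes = sl.ca.roi_sizes  # number of features in each ROI
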
Example #5
class GroupClusterThreshold(Learner):
    """Statistical evaluation of group-level average accuracy maps

    This algorithm can be used to perform cluster-thresholding of
    searchlight-based group analyses. It implements a two-stage procedure that
    uses the results of within-subject permutation analyses, estimates a
    per-feature cluster-forming threshold (via bootstrap), and uses the
    thresholded bootstrap samples to estimate the distribution of cluster
    sizes in group-average accuracy maps under the NULL hypothesis, as
    described in [1]_.

    Note: this class implements a modified version of that algorithm. The
    present implementation differs in, at least, four aspects from the
    description in that paper.

    1) Cluster p-values refer to the probability of observing a particular
       cluster size or a larger one (original paper: probability to observe a
       larger cluster only).  Consequently, probabilities reported by this
       implementation will have a tendency to be higher in comparison.

    2) Clusters found in the original (unpermuted) accuracy map are always
       included in the NULL distribution estimate of cluster sizes. This
       provides an explicit lower bound for probabilities, as there will
       always be at least one observed cluster for every cluster size found
       in the original accuracy map. Consequently, it is impossible to get a
       probability of zero for clusters of any size (see [2]_ for more
       information).

    3) Bootstrap accuracy maps that contain no clusters are counted in a
       dedicated size-zero bin in the NULL distribution of cluster sizes.
       This change yields reliable cluster-probabilities even for very low
       featurewise threshold probabilities, where (some portion) of the
       bootstrap accuracy maps do not contain any clusters.

    4) The method for FWE-correction used by the original authors is not
       provided. Instead, a range of alternatives implemented by the
       statsmodels package are available.

    Moreover, this implementation minimizes the required memory demands and
    allows for computing large numbers of bootstrap samples without
    significant increase in memory demand (CPU time trade-off).

    Instances of this class must be trained before they can be used to
    threshold accuracy maps. The training dataset must match the following
    criteria:

    1) For every subject in the group, it must contain multiple accuracy maps
       that are the result of a within-subject classification analysis
       based on permuted class labels. One map must correspond to one fixed
       permutation for all features in the map, as described in [1]_. The
       original authors recommend 100 accuracy maps per subject for a typical
       searchlight analysis.

    2) It must contain a sample attribute indicating which sample is
       associated with which subject, because bootstrapping average accuracy
       maps is implemented by randomly drawing one map from each subject.
       The name of the attribute can be configured via the ``chunk_attr``
       parameter.

    After training, an instance can be called with a dataset to perform
    thresholding and statistical evaluation. Unless a single-sample dataset
    is passed, all samples in the input dataset will be averaged prior to
    thresholding.
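
    A minimal usage sketch (names are illustrative only): assuming ``perm_ds``
    holds the per-subject permutation accuracy maps with a ``chunks`` sample
    attribute identifying subjects, and ``mean_map`` is the to-be-evaluated
    group dataset::

      clthr = GroupClusterThreshold(n_bootstrap=10000,
                                    feature_thresh_prob=0.001)
      clthr.train(perm_ds)
      res = clthr(mean_map)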

    Returns
    -------
    Dataset
      This is a shallow copy of the input dataset (after a potential
      averaging), hence contains the same data and attributes. In addition it
      includes the following attributes:

      ``fa.featurewise_thresh``
        Vector with feature-wise cluster-forming thresholds.

      ``fa.clusters_featurewise_thresh``
        Vector with labels for clusters after thresholding the input data
        with the desired feature-wise probability. Each unique non-zero
        element corresponds to an individual super-threshold cluster. Cluster
        values are sorted by cluster size (number of features). The largest
        cluster is always labeled with ``1``.

      ``fa.clusters_fwe_thresh``
        Vector with labels for super-threshold clusters after correction for
        multiple comparisons. The attribute is derived from
        ``fa.clusters_featurewise_thresh`` by removing all clusters that
        do not pass the threshold when controlling for the family-wise error
        rate.

      ``a.clusterstats``
        Record array with information on all detected clusters. The array is
        sorted according to cluster size, starting with the largest cluster
        in terms of number of features. The array contains the fields ``size``
        (number of features comprising the cluster), ``mean``, ``median``,
        ``min``, ``max``, ``std`` (respective descriptive statistics for all
        clusters), and ``prob_raw`` (probability of observing a cluster of
        this size or larger under the NULL hypothesis). If correction for
        multiple comparisons is enabled an additional field ``prob_corrected``
        (probability after correction) is added.

      ``a.clusterlocations``
        Record array with information on the location of all detected clusters.
        The array is sorted according to cluster size (same order as
        ``a.clusterstats``). The array contains the fields ``max``
        (feature coordinate of the maximum score within the cluster), and
        ``center_of_mass`` (coordinate of the center of mass, weighted by
        the feature values within the cluster).

    References
    ----------
    .. [1] Johannes Stelzer, Yi Chen and Robert Turner (2013). Statistical
       inference and multiple testing correction in classification-based
       multi-voxel pattern analysis (MVPA): Random permutations and cluster
       size control. NeuroImage, 65, 69--82.
    .. [2] Smyth, G. K., & Phipson, B. (2010). Permutation P-values Should
       Never Be Zero: Calculating Exact P-values When Permutations Are
       Randomly Drawn. Statistical Applications in Genetics and Molecular
       Biology, 9, 1--12.
    """

    n_bootstrap = Parameter(
        100000, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of bootstrap samples to be generated from the training
            dataset. For each sample, an average map will be computed from a
            set of randomly drawn samples (one from each chunk). Bootstrap
            samples will be used to estimate a featurewise NULL distribution of
            accuracy values for initial thresholding, and to estimate the NULL
            distribution of cluster sizes under the NULL hypothesis. A larger
            number of bootstrap samples reduces the lower bound of
            probabilities, which may be beneficial for multiple comparison
            correction.""")

    feature_thresh_prob = Parameter(
        0.001, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0),
        doc="""Feature-wise probability threshold. The value corresponding
            to this probability in the NULL distribution of accuracies will
            be used as threshold for cluster forming. Given that the NULL
            distribution is estimated per feature, the actual threshold value
            will vary across features yielding a threshold vector. The number
            of bootstrap samples needs to be adequate for a desired probability.
            A ``ValueError`` is raised otherwise.""")

    chunk_attr = Parameter(
        'chunks',
        doc="""Name of the attribute indicating the individual chunks from
            which a single sample each is drawn for averaging into a bootstrap
            sample.""")

    fwe_rate = Parameter(
        0.05, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0),
        doc="""Family-wise error rate for multiple comparison correction
            of cluster size probabilities.""")

    multicomp_correction = Parameter(
        'fdr_bh', constraints=EnsureChoice('bonferroni', 'sidak', 'holm-sidak',
                                           'holm', 'simes-hochberg', 'hommel',
                                           'fdr_bh', 'fdr_by', None),
        doc="""Strategy for multiple comparison correction of cluster
            probabilities. All methods supported by statsmodels' ``multitest``
            are available. In addition, ``None`` can be specified to disable
            correction.""")

    n_blocks = Parameter(
        1, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of segments used to compute the feature-wise NULL
            distributions. This parameter determines the peak memory demand.
            In case of a single segment a matrix of size
            (n_bootstrap x nfeatures) will be allocated. Increasing the number
            of segments reduces the peak memory demand by roughly that factor.
            """)

    n_proc = Parameter(
        1, constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Number of parallel processes to use for computation.
            Requires `joblib` external module.""")

    def __init__(self, **kwargs):
        # force disable auto-train: would make no sense
        Learner.__init__(self, auto_train=False, **kwargs)
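        # the smallest p-value that can be estimated from N bootstrap samples
        # is 1 / (N + 1) (see [2] in the class docstring), so the threshold
        # probability must not be below that; e.g. the default
        # n_bootstrap=100000 supports probabilities down to ~1e-5, comfortably
        # below the default feature_thresh_prob=0.001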
        if 1. / (self.params.n_bootstrap + 1) > self.params.feature_thresh_prob:
            raise ValueError('number of bootstrap samples is insufficient for'
                             ' the desired threshold probability')
        self.untrain()

    def _untrain(self):
        self._thrmap = None
        self._null_cluster_sizes = None

    @due.dcite(
        Doi("10.1016/j.neuroimage.2012.09.063"),
        description="Statistical assessment of (searchlight) MVPA results",
        tags=['implementation'])
    def _train(self, ds):
        # shortcuts
        chunk_attr = self.params.chunk_attr
        #
        # Step 0: bootstrap maps by drawing one for each chunk and average them
        # (do N iterations)
        # this could take a lot of memory, hence instead of computing the maps
        # we store which samples they would be computed from and then (re)build
        # the matrix of bootstrapped maps either row-wise or column-wise (as
        # needed) to save memory by a factor of (close to) `n_bootstrap`
        # determine which samples belong to which chunk
        chunk_samples = dict([(c, np.where(ds.sa[chunk_attr].value == c)[0])
                              for c in ds.sa[chunk_attr].unique])
        # pre-build the bootstrap combinations
        bcombos = [[random.sample(v, 1)[0] for v in list(chunk_samples.values())]
                   for i in range(self.params.n_bootstrap)]
        bcombos = np.array(bcombos, dtype=int)
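        # bcombos has shape (n_bootstrap, n_chunks): each row holds one
        # randomly drawn sample index per chunk, e.g. with 3 chunks a row
        # might look like [4, 11, 17] (hypothetical indices)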
        #
        # Step 1: find the per-feature threshold that corresponds to some p
        # in the NULL
        segwidth = ds.nfeatures // self.params.n_blocks
        # speed things up by operating on an array not a dataset
        ds_samples = ds.samples
        if __debug__:
            debug('GCTHR',
                  'Compute per-feature thresholds in %i blocks of %i features'
                  % (self.params.n_blocks, segwidth))
        # Execution can be done in parallel as the estimation is independent
        # across features

        def featuresegment_producer(ncols):
            for segstart in range(0, ds.nfeatures, ncols):
                # one average map for every stored bcombo
                # this also slices the input data into feature subsets
                # for the compute blocks
                yield [np.mean(
                       # get a view to a subset of the features
                       # -- should be somewhat efficient as feature axis is
                       # sliced
                       ds_samples[sidx, segstart:segstart + ncols],
                       axis=0)
                       for sidx in bcombos]
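        # get_thresholding_map() estimates, for every feature in a block, the
        # accuracy value corresponding to `feature_thresh_prob` in its
        # bootstrapped NULL samples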
        if self.params.n_proc == 1:
            # Serial execution
            thrmap = np.hstack(  # merge across compute blocks
                [get_thresholding_map(d, self.params.feature_thresh_prob)
                 # compute a partial threshold map for as many features
                 # as fit into a compute block
                 for d in featuresegment_producer(segwidth)])
        else:
            # Parallel execution
            verbose_level_parallel = 50 \
                if (__debug__ and 'GCTHR' in debug.active) else 0
            # local import as only parallel execution needs this
            from joblib import Parallel, delayed
            # same code as above, just in parallel with joblib's Parallel
            thrmap = np.hstack(
                Parallel(n_jobs=self.params.n_proc,
                         pre_dispatch=self.params.n_proc,
                         verbose=verbose_level_parallel)(
                             delayed(get_thresholding_map)
                        (d, self.params.feature_thresh_prob)
                             for d in featuresegment_producer(segwidth)))
        # store for later thresholding of input data
        self._thrmap = thrmap
        #
        # Step 2: threshold all NULL maps and build distribution of NULL cluster
        #         sizes
        #
        cluster_sizes = Counter()
        # recompute the bootstrap average maps to threshold them and determine
        # cluster sizes
        dsa = dict(mapper=ds.a.mapper) if 'mapper' in ds.a else {}
        if __debug__:
            debug('GCTHR', 'Estimating NULL distribution of cluster sizes')
        # this step can be computed in parallel chunks to speed things up
        if self.params.n_proc == 1:
            # Serial execution
            for sidx in bcombos:
                avgmap = np.mean(ds_samples[sidx], axis=0)[None]
                # apply threshold
                clustermap = avgmap > thrmap
                # wrap into a throw-away dataset to get the reverse mapping right
                bds = Dataset(clustermap, a=dsa)
                # this function reverse-maps every sample one-by-one, hence no need
                # to collect chunks of bootstrapped maps
                cluster_sizes = get_cluster_sizes(bds, cluster_sizes)
        else:
            # Parallel execution
            # same code as above, just restructured for joblib's Parallel
            for jobres in Parallel(n_jobs=self.params.n_proc,
                                   pre_dispatch=self.params.n_proc,
                                   verbose=verbose_level_parallel)(
                                       delayed(get_cluster_sizes)
                                  (Dataset(np.mean(ds_samples[sidx],
                                           axis=0)[None] > thrmap,
                                           a=dsa))
                                       for sidx in bcombos):
                # aggregate
                cluster_sizes += jobres
        # store cluster size histogram for later p-value evaluation
        # use a sparse matrix for easy consumption (max dim is the number of
        # features, i.e. biggest possible cluster)
        scl = dok_matrix((1, ds.nfeatures + 1), dtype=int)
        for s in cluster_sizes:
            scl[0, s] = cluster_sizes[s]
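        # e.g. if clusters of exactly 5 features occurred 17 times across all
        # thresholded NULL maps, then scl[0, 5] == 17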
        self._null_cluster_sizes = scl

    def _call(self, ds):
        if len(ds) > 1:
            # average all samples into one, assuming we got something like one
            # sample per subject as input
            avgr = mean_sample()
            ds = avgr(ds)
        # threshold input; at this point we only have one sample left
        thrd = ds.samples[0] > self._thrmap
        # mapper default
        mapper = IdentityMapper()
        # overwrite if possible
        if hasattr(ds, 'a') and 'mapper' in ds.a:
            mapper = ds.a.mapper
        # reverse-map input
        othrd = _verified_reverse1(mapper, thrd)
        # also reverse-map the unthresholded sample; it is needed below for the
        # cluster center-of-mass and peak-location statistics
        osamp = _verified_reverse1(mapper, ds.samples[0])
        # prep output dataset
        outds = ds.copy(deep=False)
        outds.fa['featurewise_thresh'] = self._thrmap
        # determine clusters
        labels, num = measurements.label(othrd)
        area = measurements.sum(othrd,
                                labels,
                                index=np.arange(1, num + 1)).astype(int)
        com = measurements.center_of_mass(
            osamp, labels=labels, index=np.arange(1, num + 1))
        maxpos = measurements.maximum_position(
            osamp, labels=labels, index=np.arange(1, num + 1))
        # for the rest we need the labels flattened
        labels = mapper.forward1(labels)
        # relabel clusters so that the largest cluster gets label 1 and the
        # label index increases with decreasing cluster size
        ordered_labels = np.zeros(labels.shape, dtype=int)
        ordered_area = np.zeros(area.shape, dtype=int)
        ordered_com = np.zeros((num, len(osamp.shape)), dtype=float)
        ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float)
        for i, idx in enumerate(np.argsort(area)):
            ordered_labels[labels == idx + 1] = num - i
            # kinda ugly, but we are looping anyway
            ordered_area[i] = area[idx]
            ordered_com[i] = com[idx]
            ordered_maxpos[i] = maxpos[idx]
        labels = ordered_labels
        area = ordered_area[::-1]
        com = ordered_com[::-1]
        maxpos = ordered_maxpos[::-1]
        del ordered_labels  # this one can be big
        # store cluster labels after forward-mapping
        outds.fa['clusters_featurewise_thresh'] = labels.copy()
        # location info
        outds.a['clusterlocations'] = \
            np.rec.fromarrays(
                [com, maxpos], names=('center_of_mass', 'max'))

        # update cluster size histogram with the actual result to get a
        # proper lower bound for p-values
        # this will make a copy, because the original matrix is int
        cluster_probs_raw = _transform_to_pvals(
            area, self._null_cluster_sizes.astype('float'))

        clusterstats = (
            [area, cluster_probs_raw],
            ['size', 'prob_raw']
        )
        # evaluate a bunch of stats for all clusters
        morestats = {}
        for cid in range(len(area)):
            # keep clusters on outer loop, because selection is more expensive
            clvals = ds.samples[0, labels == cid + 1]
            for id_, fx in (
                    ('mean', np.mean),
                    ('median', np.median),
                    ('min', np.min),
                    ('max', np.max),
                    ('std', np.std)):
                stats = morestats.get(id_, [])
                stats.append(fx(clvals))
                morestats[id_] = stats

        for k, v in list(morestats.items()):
            clusterstats[0].append(v)
            clusterstats[1].append(k)

        if self.params.multicomp_correction is not None:
            # do a local import as only this tiny portion needs statsmodels
            import statsmodels.stats.multitest as smm
            rej, probs_corr = smm.multipletests(
                cluster_probs_raw,
                alpha=self.params.fwe_rate,
                method=self.params.multicomp_correction)[:2]
            # store corrected per-cluster probabilities
            clusterstats[0].append(probs_corr)
            clusterstats[1].append('prob_corrected')
            # remove cluster labels that did not pass the FWE threshold
            for i, r in enumerate(rej):
                if not r:
                    labels[labels == i + 1] = 0
            outds.fa['clusters_fwe_thresh'] = labels
        outds.a['clusterstats'] = \
            np.rec.fromarrays(clusterstats[0], names=clusterstats[1])
        return outds
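
# A minimal usage sketch, assuming the class above is PyMVPA's
# GroupClusterThreshold measure; `perm_ds` and `mean_acc_map` are hypothetical
# placeholders: `perm_ds` would hold per-permutation accuracy maps with a
# ``chunks`` sample attribute identifying the subject each map belongs to,
# and `mean_acc_map` the observed group-average accuracy map (one sample).
clthr = GroupClusterThreshold(n_bootstrap=100000,
                              feature_thresh_prob=0.001,
                              fwe_rate=0.05,
                              multicomp_correction='fdr_bh')
clthr.train(perm_ds)
res = clthr(mean_acc_map)
# per-cluster statistics: size, prob_raw, prob_corrected, mean, median, ...
print(res.a.clusterstats)
# feature-wise labels of clusters surviving the FWE threshold
print(res.fa.clusters_fwe_thresh)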
Exemple #6
0
class Hyperalignment(ClassWithCollections):
    """Align the features across multiple datasets into a common feature space.

    This is a three-level algorithm. In the first level, a series of input
    datasets is projected into a common feature space using a configurable
    mapper. The common space is initially defined by a chosen exemplar from the
    list of input datasets, but is subsequently refined by iteratively combining
    the common space with the projected input datasets.

    In the second (optional) level, the original input datasets are again
    aligned with (or projected into) the intermediate first-level common
    space. Through a configurable number of iterations the common space is
    further refined by repeated projections of the input datasets and
    combination/aggregation of these projections into an updated common space.

    In the third level, the input datasets are again aligned with the, now
    final, common feature space. The output of this algorithm are trained
    mappers (one for each input dataset) that transform the individual features
    spaces into the common space.

    Level 1 and 2 are performed by the ``train()`` method, and level 3 is
    performed when the trained Hyperalignment instance is called with a list of
    datasets. This dataset list may or may not be identical to the training
    datasets.

    The default values for the parameters of the algorithm (e.g. projection via
    Procrustean transformation, common space aggregation by averaging) resemble
    the setup reported in :ref:`Haxby et al., Neuron (2011) <HGC+11>` *A common,
    high-dimensional model of the representational space in human ventral
    temporal cortex.*

    Examples
    --------
    >>> # get some example data
    >>> from mvpa2.testing.datasets import datasets
    >>> from mvpa2.misc.data_generators import random_affine_transformation
    >>> ds4l = datasets['uni4large']
    >>> # generate a number of distorted variants of this data
    >>> dss = [random_affine_transformation(ds4l) for i in xrange(4)]
    >>> ha = Hyperalignment()
    >>> ha.train(dss)
    >>> mappers = ha(dss)
    >>> len(mappers)
    4
    """

    training_residual_errors = ConditionalAttribute(enabled=False,
            doc="""Residual error (norm of the difference between common space
                and projected data) per each training dataset at each level. The
                residuals are stored in a dataset with one row per level, and
                one column per input dataset. The first row corresponds to the
                error of 1st-level hyperalignment; the remaining rows store the
                residual errors for each 2nd-level iteration.""")

    residual_errors = ConditionalAttribute(enabled=False,
            doc="""Residual error (norm of the difference between common space
                and projected data) per each dataset. The residuals are stored
                in a single-row dataset with one column per input dataset.""")

    # XXX Who cares whether it was chosen, or specified? This should be just
    # 'ref_ds'
    chosen_ref_ds = ConditionalAttribute(enabled=True,
            doc="""Index of the input dataset used as 1st-level reference
                dataset.""")

    # Lets use built-in facilities to specify parameters which
    # constructor should accept
    # the ``space`` of the mapper determines where the algorithm places the
    # common space definition in the datasets
    alignment = Parameter(ProcrusteanMapper(space='commonspace'),
            # might provide allowedtype
            # XXX Currently, there's no way to handle this with constraints
            doc="""The multidimensional transformation mapper. If
            `None` (default) an instance of
            :class:`~mvpa2.mappers.procrustean.ProcrusteanMapper` is
            used.""")

    alpha = Parameter(1, constraints=EnsureFloat() & EnsureRange(min=0, max=1),
            doc="""Regularization parameter to traverse between (Shrinkage)-CCA
                (canonical correlation analysis) and regular hyperalignment.
                Setting alpha to 1 makes the algorithm identical to
                hyperalignment and alpha of 0 makes it CCA. By default,
                it is 1, therefore hyperalignment. """)

    level2_niter = Parameter(1, constraints=EnsureInt() & EnsureRange(min=0),
            doc="Number of 2nd-level iterations.")

    ref_ds = Parameter(None, constraints=(EnsureRange(min=0) & EnsureInt()
                                          | EnsureNone()),
            doc="""Index of a dataset to use as 1st-level common space
                reference.  If `None`, then the dataset with the maximum
                number of features is used.""")

    zscore_all = Parameter(False, constraints='bool',
            doc="""Flag to Z-score all datasets prior hyperalignment.
            Turn it off if Z-scoring is not desired or was already performed.
            If True, returned mappers are ChainMappers with the Z-scoring
            prepended to the actual projection.""")

    zscore_common = Parameter(True, constraints='bool',
            doc="""Flag to Z-score the common space after each adjustment.
                This should be left enabled in most cases.""")

    combiner1 = Parameter(lambda x, y: 0.5 * (x + y),
            doc="""How to update common space in the 1st-level loop. This must
                be a callable that takes two arguments. The first argument is
                one of the input datasets after projection onto the 1st-level
                common space. The second argument is the current 1st-level
                common space. The 1st-level combiner is called iteratively for
                each projected input dataset, except for the reference dataset.
                By default the new common space is the average of the current
                common space and the recently projected dataset.""")

    combiner2 = Parameter(lambda l: np.mean(l, axis=0),
            doc="""How to combine all individual spaces to common space. This
            must be a callable that takes a sequence of datasets as an argument.
            The callable must return a single array. This combiner is called
            once with all datasets after 1st-level projection to create an
            updated common space, and is subsequently called again after each
            2nd-level iteration.""")


    def __init__(self, **kwargs):
        ClassWithCollections.__init__(self, **kwargs)
        self.commonspace = None


    @due.dcite(
        Doi('10.1016/j.neuron.2011.08.026'),
        description="Hyperalignment of data to a common space",
        tags=["implementation"])
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        None
          The derived common space is stored in ``self.commonspace``. Trained
          mappers are obtained by calling the trained instance with a list of
          datasets.
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha

        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                samples = residuals,
                sa = {'levels' :
                       ['1'] +
                       ['2:%i' % i for i in xrange(params.level2_niter)]})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            # Making sure that ref_ds is within range.
            # Parameter() already checks for it being a non-negative integer
            if ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.chosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in xrange(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug('HPAL_',
                      "Creating copy of a commonspace and assuring "
                      "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)

        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                     residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                        residuals)


    def __call__(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        A list of trained Mappers matching the number of input datasets.
        """
        if self.commonspace is None:
            self.train(datasets)

        # place datasets into a copy of the list since items
        # will be reassigned
        datasets = list(datasets)

        params = self.params            # for quicker access ;)
        alpha = params.alpha             # for letting me be lazy ;)
        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            # zscore them once while storing corresponding ZScoreMapper's
            # so we can assemble a comprehensive mapper at the end
            # (together with procrustes)
            zmappers = []
            for ids in xrange(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmappers.append(zmapper)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        #
        # Level 3 -- final, from-scratch, alignment to final common space
        #
        mappers = self._level3(datasets)
        # return trained mappers for projection from all datasets into the
        # common space
        if params.zscore_all:
            # We need to construct new mappers which would chain
            # zscore and then final transformation
            if params.alpha < 1:
                return [ChainMapper([zm, wm, m]) for zm, wm, m in zip(zmappers, wmappers, mappers)]
            else:
                return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
        else:
            if params.alpha < 1:
                return [ChainMapper([wm, m]) for wm, m in zip(wmappers, mappers)]
            else:
                return mappers


    def _regularize(self, datasets, alpha):
        if __debug__:
            debug('HPAL', "Using regularized hyperalignment with alpha of %d"
                    % alpha)
        wmappers = []
        for ids in xrange(len(datasets)):
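            # per-dataset regularization: W = V diag(1 / sqrt((1 - alpha) *
            # s**2 + alpha)) V^T, interpolating between plain hyperalignment
            # (alpha=1) and a CCA-like whitening of the features (alpha=0)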
            U, S, Vh = np.linalg.svd(datasets[ids])
            S = 1/np.sqrt( (1-alpha)*np.square(S) + alpha )
            S.resize(len(Vh))
            S = np.matrix(np.diag(S))
            W = np.matrix(Vh.T)*S*np.matrix(Vh)
            wmapper = StaticProjectionMapper(proj=W, auto_train=False)
            wmapper.train(datasets[ids])
            wmappers.append(wmapper)
            datasets[ids] = wmapper.forward(datasets[ids])
        return datasets, wmappers


    def _level1(self, datasets, commonspace, ref_ds, mappers, residuals):
        params = self.params            # for quicker access ;)
        data_mapped = [ds.samples for ds in datasets]
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 1: ds #%i" % i)
            if i == ref_ds:
                continue
            # assign common space to ``space`` of the mapper, because this is
            # where it will be looking for it
            ds_new.sa[m.get_space()] = commonspace
            # find transformation of this dataset into the current common space
            m.train(ds_new)
            # remove common space attribute again to save on memory when the
            # common space is updated for the next iteration
            del ds_new.sa[m.get_space()]
            # project this dataset into the current common space
            ds_ = m.forward(ds_new.samples)
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            # replace original dataset with mapped one -- only the reference
            # dataset will remain unchanged
            data_mapped[i] = ds_

            # compute first-level residuals wrt to the initial common space
            if residuals is not None:
                residuals[0, i] = np.linalg.norm(ds_ - commonspace)

            # Update the common space. This is an incremental update after
            # processing each 1st-level dataset. Maybe there should be a flag
            # to make a batch update after processing all 1st-level datasets
            # to an identical 1st-level common space
            # TODO: make just a function so we don't waste space
            commonspace = params.combiner1(ds_, commonspace)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)
        return data_mapped


    def _level2(self, datasets, lvl1_data, mappers, residuals):
        params = self.params            # for quicker access ;)
        data_mapped = lvl1_data
        # aggregate all processed 1st-level datasets into a new 2nd-level
        # common space
        commonspace = params.combiner2(data_mapped)

        # XXX Why is this commented out? Who knows what combiner2 is doing and
        # whether it changes the distribution of the data
        #if params.zscore_common:
        #zscore(commonspace, chunks_attr=None)

        ndatasets = len(datasets)
        for loop in xrange(params.level2_niter):
            # 2nd-level alignment starts from the original/unprojected datasets
            # again
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i))

                # Optimization speed up heuristic
                # Slightly modify the common space towards other feature
                # spaces and reduce influence of this feature space for the
                # to-be-computed projection
                temp_commonspace = (commonspace * ndatasets - data_mapped[i]) \
                                    / (ndatasets - 1)
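                # i.e. the mean of the other datasets' projections, given that
                # the current common space is (by default) the average over
                # all of them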

                if params.zscore_common:
                    zscore(temp_commonspace, chunks_attr=None)
                # assign current common space
                ds_new.sa[m.get_space()] = temp_commonspace
                # retrain the mapper for this dataset
                m.train(ds_new)
                # remove common space attribute again to save on memory when the
                # common space is updated for the next iteration
                del ds_new.sa[m.get_space()]
                # obtain the 2nd-level projection
                ds_ =  m.forward(ds_new.samples)
                if params.zscore_common:
                    zscore(ds_, chunks_attr=None)
                # store for 2nd-level combiner
                data_mapped[i] = ds_
                # compute residuals
                if residuals is not None:
                    residuals[1+loop, i] = np.linalg.norm(ds_ - commonspace)

            commonspace = params.combiner2(data_mapped)

        # and again
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

        # return the final common space
        return commonspace


    def _level3(self, datasets):
        params = self.params            # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key difference from level 2: the common space is kept fixed
        #temp_commonspace = commonspace

        residuals = None
        if self.ca['residual_errors'].enabled:
            residuals = np.zeros((1, len(datasets)))
            self.ca.residual_errors = Dataset(samples=residuals)

        # start from original input datasets again
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            # retrain mapper on final common space
            ds_new.sa[m.get_space()] = self.commonspace
            m.train(ds_new)
            # remove common space attribute again to save on memory
            del ds_new.sa[m.get_space()]

            if residuals is not None:
                # obtain final projection
                data_mapped = m.forward(ds_new.samples)
                residuals[0, i] = np.linalg.norm(data_mapped - self.commonspace)

        return mappers
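
# A brief sketch of tuning the algorithm, reusing the data generation from the
# docstring example above: more 2nd-level iterations and a custom 1st-level
# combiner that weighs the current common space more heavily.
from mvpa2.testing.datasets import datasets as testing_datasets
from mvpa2.misc.data_generators import random_affine_transformation

dss = [random_affine_transformation(testing_datasets['uni4large'])
       for i in range(4)]
ha = Hyperalignment(level2_niter=3,
                    combiner1=lambda proj, common: 0.25 * proj + 0.75 * common)
ha.train(dss)
# one trained mapper per dataset; apply them to obtain the datasets in the
# common space
aligned = [m.forward(ds) for m, ds in zip(ha(dss), dss)]
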
class SearchlightHyperalignment(ClassWithCollections):
    """
    Given a list of datasets, provide a list of mappers
    into common space using searchlight based hyperalignment.
    :ref:`Guntupalli et al., Cerebral Cortex (2016)`

    1) Input datasets should all be of the same size in terms of
    nsamples and nfeatures, and be coarsely aligned (using anatomy).
    2) All features in all datasets should be zscored.
    3) Datasets should have feature attribute `voxel_indices`
    containing spatial coordinates of all features
    """

    # TODO: add {training_,}residual_errors .ca ?

    ## Parameters common with Hyperalignment but overriden

    ref_ds = Parameter(0, constraints=EnsureInt() & EnsureRange(min=0),
        doc="""Index of a dataset to use as a reference. First dataset is used
            as default. If you supply exclude_from_model list, you should supply
            the ref_ds index as index before you remove those excluded datasets.
            Note that unlike regular Hyperalignment, there is no automagic
            choosing of the "best" ref_ds by default.""")

    ## Parameters specific to SearchlightHyperalignment

    queryengine = Parameter(
        None,
        doc="""A single (or a list of query engines, one per each dataset) to be
        used.  If not provided, volumetric searchlight, with spherical
        neighborhood as instructed by radius parameter will be used.""")

    radius = Parameter(
        3,
        constraints=EnsureInt() & EnsureRange(min=1),
        doc="""Radius of a searchlight sphere in number of voxels to be used if
         no `queryengine` argument was provided.""")

    nproc = Parameter(
        1,
        constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(),
        doc="""Number of cores to use.""")

    nblocks = Parameter(
        None,
        constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(),
        doc="""Number of blocks to divide to process. Higher number results in
            smaller memory consumption.""")

    sparse_radius = Parameter(
        None,
        constraints=(EnsureRange(min=1) & EnsureInt() | EnsureNone()),
        doc="""Radius supplied to scatter_neighborhoods in units of voxels.
            This is effectively the distance between the centers where
            hyperalignment is performed in searchlights.  ATM applicable only
            if no custom queryengine was provided.
            If None, hyperalignment is performed at every voxel (default).""")

    hyperalignment = Parameter(
        Hyperalignment(ref_ds=None),
        doc="""Hyperalignment instance to be used in each searchlight sphere.
            Default is just the Hyperalignment instance with default
            parameters. Its `ref_ds` parameter would be overridden by the
            `ref_ds` parameter of this SearchlightHyperalignment instance
            because we want to be consistent and only need one `ref_ds`.""")

    combine_neighbormappers = Parameter(
        True,
        constraints=EnsureBool(),
        doc="""This param determines whether to combine mappers for each voxel
            from its neighborhood searchlights or just use the mapper for which
            it is the center voxel. This will not be applicable for certain
            queryengines whose ids and neighborhoods are from different spaces,
            such as for SurfaceVerticesQueryEngine""")

    compute_recon = Parameter(
        True,
        constraints=EnsureBool(),
        doc="""This param determines whether to compute reverse mappers for each
            subject from common-space to subject space. These will be stored in
            the StaticProjectionMapper() and used when reverse() is called.
            Enabling it will double the size of the mappers returned.""")

    featsel = Parameter(
        1.0,
        constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0) |
            EnsureInt() & EnsureRange(min=2),
        doc="""Determines if feature selection will be performed in each searchlight.
            1.0: Use all features. < 1.0 is understood as selecting that
            proportion of features in each searchlight of ref_ds using feature scores;
            > 1.0 is understood as selecting at most that many features in each
            searchlight.""")

    # TODO: Should we get rid of this feature?
    use_same_features = Parameter(
        False,
        constraints=EnsureBool(),
        doc="""Select the same (best) features when doing feature selection for
            all datasets.""")

    exclude_from_model = Parameter(
        [],
        constraints=EnsureListOf(int),
        doc="""List of dataset indices that will not participate in building
            common model.  These will still get mappers back but they don't
            influence the model or voxel selection.""")

    mask_node_ids = Parameter(
        None,
        constraints=EnsureListOf(int) | EnsureNone(),
        doc="""You can specify a mask to compute searchlight hyperalignment only
            within this mask.  These would be a list of voxel indices.""")

    dtype = Parameter(
        'float32',
        constraints='str',
        doc="""dtype of elements transformation matrices to save on memory for
            big datasets""")

    results_backend = Parameter(
        'hdf5',
        constraints=EnsureChoice('hdf5', 'native'),
        doc="""'hdf5' or 'native'. See Searchlight documentation.""")

    tmp_prefix = Parameter(
        'tmpsl',
        constraints='str',
        doc="""Prefix for temporary files. See Searchlight documentation.""")

    def __init__(self, **kwargs):
        _shpaldebug("Initializing.")
        ClassWithCollections.__init__(self, **kwargs)
        self.ndatasets = 0
        self.nfeatures = 0
        self.projections = None
        # This option forces the roi_seed in each SL to be selected during feature selection
        self.force_roi_seed = True
        if self.params.nproc is not None and self.params.nproc > 1 \
                and not externals.exists('pprocess'):
            raise RuntimeError("The 'pprocess' module is required for "
                               "multiprocess searchlights. Please either "
                               "install python-pprocess, or reduce `nproc` "
                               "to 1 (got nproc=%i) or set to default None"
                               % self.params.nproc)
        if not externals.exists('scipy'):
            raise RuntimeError("The 'scipy' module is required for "
                               "searchlight hyperalignment.")
        if self.params.results_backend == 'native':
            raise NotImplementedError("'native' mode to handle results is still a "
                                      "work in progress.")
            #warning("results_backend is set to 'native'. This has been known"
            #        "to result in longer run time when working with big datasets.")
        if self.params.results_backend == 'hdf5' and \
                not externals.exists('h5py'):
            raise RuntimeError("The 'hdf5' module is required for "
                               "when results_backend is set to 'hdf5'")

    def _proc_block(self, block, datasets, featselhyper, queryengines, seed=None, iblock='main'):
        if seed is not None:
            mvpa2.seed(seed)
        if __debug__:
            debug('SLC', 'Starting computing block for %i elements' % len(block))
        bar = ProgressBar()
        projections = [csc_matrix((self.nfeatures, self.nfeatures),
                                  dtype=self.params.dtype)
                       for isub in range(self.ndatasets)]
        for i, node_id in enumerate(block):
            # retrieve the feature ids of all features in the ROI from the query
            # engine

            # Find the neighborhood for that selected nearest node
            roi_feature_ids_all = [qe[node_id] for qe in queryengines]
            # handling queryengines that return AttrDatasets
            for isub in range(len(roi_feature_ids_all)):
                if is_datasetlike(roi_feature_ids_all[isub]):
                    # making sure the queryengine returned properly shaped output
                    assert(roi_feature_ids_all[isub].nsamples == 1)
                    roi_feature_ids_all[isub] = roi_feature_ids_all[isub].samples[0, :].tolist()
            if len(roi_feature_ids_all) == 1:
                # just one was provided to be "broadcasted"
                roi_feature_ids_all *= len(datasets)
            # if qe returns zero-sized ROI for any subject, pass...
            if any(len(x)==0 for x in roi_feature_ids_all):
                continue
            # selecting the neighborhood for all subjects for hyperalignment
            ds_temp = [sd[:, ids] for sd, ids in zip(datasets, roi_feature_ids_all)]
            if self.force_roi_seed:
                roi_seed = np.array(roi_feature_ids_all[self.params.ref_ds]) == node_id
                ds_temp[self.params.ref_ds].fa['roi_seed'] = roi_seed
            if __debug__:
                msg = 'ROI (%i/%i), %i features' % (i + 1, len(block),
                                                    ds_temp[self.params.ref_ds].nfeatures)
                debug('SLC', bar(float(i + 1) / len(block), msg), cr=True)
            hmappers = featselhyper(ds_temp)
            assert(len(hmappers) == len(datasets))
            roi_feature_ids_ref_ds = roi_feature_ids_all[self.params.ref_ds]
            for isub, roi_feature_ids in enumerate(roi_feature_ids_all):
                if not self.params.combine_neighbormappers:
                    I = roi_feature_ids
                    #J = [roi_feature_ids[node_id]] * len(roi_feature_ids)
                    J = [node_id] * len(roi_feature_ids)
                    V = hmappers[isub].tolist()
                    if np.isscalar(V):
                        V = [V]
                else:
                    I, J, V = [], [], []
                    for f2, roi_feature_id_ref_ds in enumerate(roi_feature_ids_ref_ds):
                        I += roi_feature_ids
                        J += [roi_feature_id_ref_ds] * len(roi_feature_ids)
                        V += hmappers[isub][:, f2].tolist()
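                # I, J, V are the row indices (this subject's feature ids),
                # column indices (ref_ds feature ids, or the center node id),
                # and weights of the sparse projection increment assembled next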
                proj = coo_matrix(
                    (V, (I, J)),
                    shape=(max(self.nfeatures, max(I) + 1), max(self.nfeatures, max(J) + 1)),
                    dtype=self.params.dtype)
                proj = proj.tocsc()
                # Cleaning up the current subject's projections to free up memory
                hmappers[isub] = [[] for _ in hmappers]
                projections[isub] = projections[isub] + proj

        if self.params.results_backend == 'native':
            return projections
        elif self.params.results_backend == 'hdf5':
            # store results in a temporary file and return a filename
            results_file = mktemp(prefix=self.params.tmp_prefix,
                                  suffix='-%s.hdf5' % iblock)
            if __debug__:
                debug('SLC', "Storing results into %s" % results_file)
            h5save(results_file, projections)
            if __debug__:
                debug('SLC_', "Results stored")
            return results_file
        else:
            raise RuntimeError("Must not reach this point")

    def __handle_results(self, results):
        if self.params.results_backend == 'hdf5':
            # 'results' must be just a filename
            assert(isinstance(results, str))
            if __debug__:
                debug('SLC', "Loading results from %s" % results)
            results_data = h5load(results)
            os.unlink(results)
            if __debug__:
                debug('SLC_', "Loaded results of len=%d from"
                      % len(results_data))
            for isub, res in enumerate(results_data):
                self.projections[isub] = self.projections[isub] + res
            if __debug__:
                debug('SLC_', "Finished adding results")
            return

    def __handle_all_results(self, results):
        """Helper generator to decorate passing the results out to
        results_fx
        """
        for r in results:
            yield self.__handle_results(r)

    @due.dcite(
        Doi('10.1093/cercor/bhw068'),
        description="Full cortex hyperalignment of data to a common space",
        tags=["implementation"])
    def __call__(self, datasets):
        """Estimate mappers for each dataset using searchlight-based
        hyperalignment.

        Parameters
        ----------
          datasets : list or tuple of datasets

        Returns
        -------
        A list of trained StaticProjectionMappers of the same length as datasets
        """

        # Perform some checks first before modifying internal state
        params = self.params
        ndatasets = len(datasets)

        if len(datasets) <= 1:
            raise ValueError("SearchlightHyperalignment needs > 1 dataset to "
                             "operate on. Got: %d" % self.ndatasets)

        if params.ref_ds in params.exclude_from_model:
            raise ValueError("Requested reference dataset %i is also "
                             "in the exclude list." % params.ref_ds)

        if params.ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided"
                             % (params.ref_ds, ndatasets))

        # The rest of the checks are just warnings
        self.ndatasets = ndatasets

        _shpaldebug("SearchlightHyperalignment %s for %i datasets"
                    % (self, self.ndatasets))

        selected = [_ for _ in range(ndatasets)
                    if _ not in params.exclude_from_model]
        ref_ds_train = selected.index(params.ref_ds)
        params.hyperalignment.params.ref_ds = ref_ds_train
        warning('Using %dth dataset as the reference dataset (%dth after '
                'excluding datasets)' % (params.ref_ds, ref_ds_train))
        if len(params.exclude_from_model) > 0:
            warning("These datasets will not participate in building common "
                    "model: %s" % params.exclude_from_model)

        if __debug__:
            # verify that datasets were zscored prior the alignment since it is
            # assumed/required preprocessing step
            for ids, ds in enumerate(datasets):
                for f, fname, tval in ((np.mean, 'means', 0),
                                       (np.std, 'stds', 1)):
                    vals = f(ds, axis=0)
                    vals_comp = np.abs(vals - tval) > 1e-5
                    if np.any(vals_comp):
                        warning('%d %s are too different (max diff=%g) from %d in '
                                'dataset %d to come from a zscored dataset. '
                                'Please zscore datasets first for correct operation '
                                '(unless it was intentional)'
                                % (np.sum(vals_comp), fname,
                                   np.max(np.abs(vals - tval)), tval, ids))

        # Setting up SearchlightHyperalignment
        # we need to know which original features were comprising the
        # individual SL ROIs
        _shpaldebug('Initializing FeatureSelectionHyperalignment.')
        hmeasure = FeatureSelectionHyperalignment(
            ref_ds=params.ref_ds,
            featsel=params.featsel,
            hyperalignment=params.hyperalignment,
            full_matrix=params.combine_neighbormappers,
            use_same_features=params.use_same_features,
            exclude_from_model=params.exclude_from_model,
            dtype=params.dtype)

        # Performing SL processing manually
        _shpaldebug("Setting up for searchlights")
        if params.nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                params.nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1"
                        % externals.versions['pprocess'])
                params.nproc = 1

        # XXX I think this class should already accept a single dataset only.
        # It should have a ``space`` setting that names a sample attribute that
        # can be used to identify individual/original datasets.
        # Taking a single dataset as argument would be cleaner, because the
        # algorithm relies on the assumption that there is a coarse feature
        # alignment, i.e. the SL ROIs cover roughly the same area
        queryengines = self._get_trained_queryengines(
            datasets, params.queryengine, params.radius, params.ref_ds)
        # For surface nodes to voxels queryengines, roi_seed hardly makes sense
        qe = queryengines[(0 if len(queryengines) == 1 else params.ref_ds)]
        if isinstance(qe, SurfaceVerticesQueryEngine):
            self.force_roi_seed = False
            if not self.params.combine_neighbormappers:
                raise NotImplementedError("Mapping from voxels to surface nodes is not "
                        "implmented yet. Try setting combine_neighbormappers to True.")
        self.nfeatures = datasets[params.ref_ds].nfeatures
        _shpaldebug("Performing Hyperalignment in searchlights")
        # Setting up centers for running SL Hyperalignment
        if params.sparse_radius is None:
            roi_ids = self._get_verified_ids(queryengines) \
                if params.mask_node_ids is None \
                else params.mask_node_ids
        else:
            if params.queryengine is not None:
                raise NotImplementedError(
                    "using sparse_radius whenever custom queryengine is "
                    "provided is not yet supported.")
            _shpaldebug("Setting up sparse neighborhood")
            from mvpa2.misc.neighborhood import scatter_neighborhoods
            if params.mask_node_ids is None:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices,
                    deterministic=True)
                roi_ids = sidx
            else:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices[params.mask_node_ids],
                    deterministic=True)
                roi_ids = [params.mask_node_ids[sid] for sid in sidx]

        # Initialize projections
        _shpaldebug('Initializing projection matrices')
        self.projections = [
            csc_matrix((self.nfeatures, self.nfeatures), dtype=params.dtype)
            for isub in range(self.ndatasets)]

        # compute
        if params.nproc is not None and params.nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), params.nproc)
            params.nblocks = nproc_needed \
                if params.nblocks is None else params.nblocks
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug('SLC', "Starting off %s child processes for nblocks=%i"
                      % (nproc_needed, params.nblocks))
            compute = p_results.manage(
                        pprocess.MakeParallel(self._proc_block))
            seed = mvpa2.get_random_seed()
            for iblock, block in enumerate(node_blocks):
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block, datasets, copy.copy(hmeasure), queryengines,
                        seed=seed, iblock=iblock)
        else:
            # otherwise collect the results in an 1-item list
            _shpaldebug('Using 1 process to compute mappers.')
            if params.nblocks is None:
                params.nblocks = 1
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            p_results = [self._proc_block(block, datasets, hmeasure, queryengines)
                         for block in node_blocks]
        results_ds = self.__handle_all_results(p_results)
        # consume the generator so that all results actually get merged into
        # self.projections
        list(results_ds)

        _shpaldebug('Wrapping projection matrices into StaticProjectionMappers')
        self.projections = [
            StaticProjectionMapper(proj=proj, recon=proj.T) if params.compute_recon
            else StaticProjectionMapper(proj=proj)
            for proj in self.projections]
        return self.projections

    def _get_verified_ids(self, queryengines):
        """Helper to return ids of queryengines, verifying that they are the same"""
        qe0 = queryengines[0]
        roi_ids = qe0.ids
        for qe in queryengines:
            if qe is not qe0:
                # if a different query engine (so wasn't just replicated)
                if np.any(qe.ids != qe0.ids):
                    raise RuntimeError(
                        "Query engine %s provided different ids than %s. Not supported"
                        % (qe0, qe))
        return roi_ids

    def _get_trained_queryengines(self, datasets, queryengine, radius, ref_ds):
        """Helper to return trained query engine(s), either list of one or one per each dataset

        if queryengine is None then IndexQueryEngine based on radius is created
        """
        ndatasets = len(datasets)
        if queryengine:
            if isinstance(queryengine, (list, tuple)):
                queryengines = queryengine
                if len(queryengines) != ndatasets:
                    raise ValueError(
                        "%d query engines were specified although %d datasets "
                        "provided" % (len(queryengines), ndatasets))
                _shpaldebug("Training provided query engines")
                for qe, ds in zip(queryengines, datasets):
                    qe.train(ds)
            else:
                queryengine.train(datasets[ref_ds])
                queryengines = [queryengine]
        else:
            _shpaldebug('No custom query engines were provided. Setting up the '
                        'volumetric query engine on voxel_indices.')
            queryengine = IndexQueryEngine(voxel_indices=Sphere(radius))
            queryengine.train(datasets[ref_ds])
            queryengines = [queryengine]
        return queryengines
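
# A minimal usage sketch (hypothetical list of fMRI datasets `dss`, already
# zscored and carrying ``voxel_indices`` feature attributes, as required
# above): run searchlight hyperalignment on a sparse grid of sphere centers
# and project every subject into the common space.
slhyper = SearchlightHyperalignment(radius=3, sparse_radius=3, featsel=1.0)
slmappers = slhyper(dss)
dss_common = [m.forward(ds) for m, ds in zip(slmappers, dss)]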

# A little debug helper to avoid constant if __debug__ conditioning,
# but it also means that debugging cannot be activated at run time
# after the import of this module
def _shpaldebug(*args):
    pass
if __debug__:
    from mvpa2.base import debug
    if 'SHPAL' in debug.active:
        def _shpaldebug(msg):
            debug('SHPAL', "%s" % msg)


@due.dcite(
    Doi('10.1016/j.neuron.2011.08.026'),
    description="Per-feature measure of maximal correlation to features in other datasets",
    tags=["implementation"])
def compute_feature_scores(datasets, exclude_from_model=None):
    """
    Takes a list of datasets and computes a feature score for each feature in
    each dataset: a measure of its maximal correlation to features in the
    other datasets
    :ref:`Haxby et al., Neuron (2011) <HGC+11>`

    Parameters
    ----------
    datasets : list or tuple of datasets

    exclude_from_model : list of int
        Dataset indices that won't participate in voxel selection of the others