Example #1
    def handle_arg(arg):
        """Helper which would read in SpatialImage if necessary
        """
        if arg is None:
            return arg
        if isinstance(arg, basestring):
            arg = nb.load(arg)
            argshape = arg.get_shape()
            # Assure that we have 3D (at least)
            if len(argshape)<3:
                arg = nb.Nifti1Image(
                        arg.get_data().reshape(argshape + (1,)*(3-len(argshape))),
                        arg.get_affine(),
                        arg.get_header())
        else:
            argshape = arg.shape

        if len(argshape) == 4:
            if argshape[-1] > 1:
                warning("For now plot_lightbox can handle only 3d, 4d data was provided."
                        " Plotting only the first volume")
            if isinstance(arg, SpatialImage):
                arg = nb.Nifti1Image(arg.get_data()[..., 0], arg.get_affine(), arg.get_header())
            else:
                arg = arg[..., 0]
        elif len(argshape) != 3:
            raise ValueError, "For now just handling 3D volumes"
        return arg
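Note: the reshape trick above (padding trailing singleton axes until the data is at least 3D) can be shown with plain NumPy, independent of nibabel. A minimal sketch; atleast_3d_trailing is a hypothetical helper name, not part of the library:

import numpy as np

def atleast_3d_trailing(arr):
    # append singleton axes so the array has at least three dimensions
    shape = arr.shape
    if len(shape) < 3:
        arr = arr.reshape(shape + (1,) * (3 - len(shape)))
    return arr

print(atleast_3d_trailing(np.zeros((4, 5))).shape)   # -> (4, 5, 1)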
Example #2
def _check_cosmo_dataset(cosmo):
    '''
    Helper function to ensure a cosmo input for cosmo_dataset is valid.
    Currently does two things:
    (1) raise an error if there are no samples
    (2) raise a warning if samples have very large or very small values. A use
        case is certain MEEG datasets with very small sample values
        (in the order of 1e-25) which affects some classifiers
    '''

    samples = cosmo.get('samples', None)

    if samples is None:
        raise KeyError("Missing field .samples in %s" % cosmo)

    # check for extreme values
    warn_for_extreme_values_decimals = 10

    # ignore NaNs and infinity
    nonzero_msk = np.logical_and(np.isfinite(samples), samples != 0)

    if np.any(nonzero_msk):
        max_nonzero = np.max(np.abs(samples[nonzero_msk]))

        # see how many decimals in the largest absolute value
        decimals_nonzero = np.log10(max_nonzero)

        if abs(decimals_nonzero) > warn_for_extreme_values_decimals:
            msg = (
                'Samples have extreme values, maximum absolute value is %s; '
                'This may affect some analyses. Consider scaling the samples, '
                'e.g. by a factor of 10**%d ' %
                (max_nonzero, -decimals_nonzero))
            warning(msg)
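Note: the extreme-value check boils down to taking log10 of the largest finite, nonzero absolute value and comparing its magnitude against a threshold. A standalone sketch of that idea with plain NumPy; extreme_value_order is a hypothetical helper, not the library function:

import numpy as np

def extreme_value_order(samples, threshold_decimals=10):
    # order of magnitude of the largest finite nonzero value,
    # or None if it lies within +/- threshold_decimals decades
    samples = np.asarray(samples, dtype=float)
    msk = np.logical_and(np.isfinite(samples), samples != 0)
    if not np.any(msk):
        return None
    decades = np.log10(np.max(np.abs(samples[msk])))
    return decades if abs(decades) > threshold_decimals else None

print(extreme_value_order([0.0, 3e-25, 1e-30]))   # ~ -24.5, so a warning would fire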
Example #3
 def stability_assurance(cdf):
     if __debug__ and 'CHECK_STABILITY' in debug.active:
         cdf_min, cdf_max = np.min(cdf), np.max(cdf)
         if cdf_min < 0 or cdf_max > 1.0:
             s = ('', ' for %s' % name)[int(name is not None)]
             warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                     (cdf_func, s, cdf_min, cdf_max))
Example #4
    def label_voxel(self, c, levels = None):

        if self.__referenceLevel is None:
            warning("You did not provide what level to use "
                    "for reference. Assigning 0th level -- '%s'"
                    % (self._levels[0],))
            self.set_reference_level(0)
            # return self.__referenceAtlas.label_voxel(c, levels)

        c = self._check_range(c)

        # obtain coordinates of the closest voxel
        cref = self._data[ self.__referenceLevel.indexes, c[0], c[1], c[2] ]
        dist = norm( (cref - c) * self.voxdim )
        if __debug__:
            debug('ATL__', "Closest referenced point for %r is "
                  "%r at distance %3.2f" % (c, cref, dist))
        if (self.distance - dist) >= 1e-3: # neglect everything smaller
            result = self.__referenceAtlas.label_voxel(cref, levels)
            result['voxel_referenced'] = c
            result['distance'] = dist
        else:
            result = self.__referenceAtlas.label_voxel(c, levels)
            if __debug__:
                debug('ATL__', "Closest referenced point is "
                      "further than desired distance %.2f" % self.distance)
            result['voxel_referenced'] = None
            result['distance'] = 0
        return result
Example #5
    def __init__(self, **kwargs):
        """Initialize an SMLR classifier.
        """

        """
        TODO:
         # Add in likelihood calculation
         # Add kernels, not just direct methods.
         """
        # init base class first
        Classifier.__init__(self, **kwargs)

        if _cStepwiseRegression is None and self.params.implementation == 'C':
            warning('SMLR: C implementation is not available.'
                    ' Using pure Python one')
            self.params.implementation = 'Python'

        # pylint friendly initializations
        self._ulabels = None
        """Unigue labels from the training set."""
        self.__weights_all = None
        """Contains all weights including bias values"""
        self.__weights = None
        """Just the weights, without the biases"""
        self.__biases = None
        """The biases, will remain none if has_bias is False"""
Example #6
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Simple wrapper around cholesky to incrementally regularize the
    matrix until successful computation.

    For `nsteps` we boost diagonal 10-fold each time from the
    'epsilon' of the respective dtype. If None -- would proceed until
    reaching 1.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))
    result = None
    for step in xrange(nsteps):
        epsilon_value = (10**step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError, e:
            warning("Cholesky decomposition lead to failure: %s.  "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step < nsteps - 1:
                if __debug__:
                    debug(
                        "GPR", "Failed to obtain cholesky on "
                        "auto-regularization step %d value %g. Got %s."
                        " Boosting lambda more to reg. C." %
                        (step, epsilon_value, e))
                continue
            else:
                raise
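Note: the auto-regularization strategy above can be reproduced with SciPy directly: add a diagonal "jitter" starting at the dtype's epsilon and boost it 10-fold until the factorization succeeds. A sketch assuming SciPy is available; cholesky_autoreg is a hypothetical stand-in, not the library function:

import numpy as np
from numpy.linalg import LinAlgError
from scipy.linalg import cholesky

def cholesky_autoreg(C, nsteps=16):
    # retry Cholesky, boosting the diagonal 10-fold from dtype eps each step
    for step in range(nsteps):
        jitter = (10 ** step) * np.finfo(C.dtype).eps * np.eye(C.shape[0])
        try:
            return cholesky(C + jitter, lower=True)
        except LinAlgError:
            if step == nsteps - 1:
                raise

C = np.ones((3, 3))                                 # rank-deficient matrix
L = cholesky_autoreg(C)
print(np.allclose(np.dot(L, L.T), C, atol=1e-6))    # True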
Example #7
    def to_npz(self, filename, compress=True):
        """Save dataset to a .npz file storing all fa/sa/a which are ndarrays

        Parameters
        ----------
        filename : str
        compress : bool, optional
          If True, savez_compressed is used
        """
        savez = np.savez_compressed if compress else np.savez
        if not filename.endswith('.npz'):
            filename += '.npz'
        entries = {'samples': self.samples}
        skipped = []
        for c in ('a', 'fa', 'sa'):
            col = getattr(self, c)
            for k in col:
                v = col[k].value
                e = '%s.%s' % (c, k)
                if isinstance(v, np.ndarray):
                    entries[e] = v
                else:
                    skipped.append(e)
        if skipped:
            warning("Skipping %s since not ndarrays" % (', '.join(skipped)))
        return savez(filename, **entries)
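Note: the underlying storage is just NumPy's .npz machinery; only ndarray-valued attributes survive, everything else is skipped with a warning. A quick round-trip sketch using NumPy alone (the file name 'demo.npz' and the attribute names are made up):

import numpy as np

entries = {'samples': np.random.rand(4, 3),
           'sa.targets': np.array([0, 1, 0, 1])}
np.savez_compressed('demo.npz', **entries)

npz = np.load('demo.npz')
print(sorted(npz.files))        # ['sa.targets', 'samples']
print(npz['samples'].shape)     # (4, 3)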
Example #9
def run(args):
    if not args.store is None and args.output is None:
        raise ValueError("--output is require for result storage")
    if not args.data is None:
        dss = [arg2ds(d) for d in args.data]
        if len(dss):
            # convenience short-cut
            ds = dss[0]
    try:
        import nose.tools as nt
    except ImportError:
        pass
    for expr in args.eval:
        if expr == '-':
            exec sys.stdin
        elif os.path.isfile(expr):
            execfile(expr, globals(), locals())
        else:
            exec expr
    if not args.store is None:
        out = {}
        for var in args.store:
            try:
                out[var] = locals()[var]
            except KeyError:
                warning("'%s' not found in local name space -- skipped." % var)
        if len(out):
            ds2hdf5(out, args.output, compression=args.hdf5_compression)
Example #10
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Simple wrapper around cholesky to incrementally regularize the
    matrix until successful computation.

    For `nsteps` we boost diagonal 10-fold each time from the
    'epsilon' of the respective dtype. If None -- would proceed until
    reaching 1.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))
    result = None
    for step in xrange(nsteps):
        epsilon_value = (10**step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError, e:
            warning("Cholesky decomposition lead to failure: %s.  "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step < nsteps-1:
                if __debug__:
                    debug("GPR", "Failed to obtain cholesky on "
                          "auto-regularization step %d value %g. Got %s."
                          " Boosting lambda more to reg. C."
                          % (step, epsilon_value, e))
                continue
            else:
                raise
Example #11
def run(args):
    if args.store is not None and args.output is None:
        raise ValueError("--output is require for result storage")
    if args.data is not None:
        dss = [arg2ds(d) for d in args.data]
        if len(dss):
            # convenience short-cut
            ds = dss[0]
    try:
        import nose.tools as nt
    except ImportError:
        pass
    for expr in args.eval:
        if expr == '-':
            exec sys.stdin
        elif os.path.isfile(expr):
            execfile(expr, globals(), locals())
        else:
            exec expr
    if args.store is not None:
        out = {}
        for var in args.store:
            try:
                out[var] = locals()[var]
            except KeyError:
                warning("'%s' not found in local name space -- skipped." % var)
        if len(out):
            ds2hdf5(out, args.output, compression=args.hdf5_compression)
Example #12
    def _check(self):
        '''ensures that different fields are sort of consistent'''
        fields = ['_v', '_f', '_nv', '_nf']
        if not all(hasattr(self, field) for field in fields):
            raise Exception("Incomplete surface!")

        if self._v.shape != (self._nv, 3):
            raise Exception("Wrong shape for vertices")

        if self._f.shape != (self._nf, 3):
            raise Exception("Wrong shape for faces")

        # see if all faces have a corresponding node.
        # actually this would not invalidate the surface, so
        # we only give a warning
        unqf = np.unique(self._f)
        if unqf.size != self._nv:
            from mvpa2.base import warning
            warning("Count mismatch for face range (%d!=%d), "
                            "faces without node: %r" % (unqf.size, self._nv,
                                    len(set(range(self._nv)) - set(unqf))))


        if np.any(unqf != np.arange(self._nv)):
            from mvpa2.base import warning
            warning("Missing values in faces")
Example #13
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium']
        train = train[train.sa.train == 1]
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium']
        test3 = test3[test3.sa.train == 1]
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferMeasure(l_clf,
                               Splitter('train', attr_values=[1, 1]),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

        self.assertRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        te = np.asscalar(te)
        self.assertTrue(
            abs(e - te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        if 'multiclass' in l_clf.__tags__:
            self.assertFalse(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
Example #14
    def handle_arg(arg):
        """Helper which would read in SpatialImage if necessary
        """
        if arg is None:
            return arg
        if isinstance(arg, basestring):
            arg = nb.load(arg)
            argshape = arg.shape
            # Assure that we have 3D (at least)
            if len(argshape)<3:
                arg = nb.Nifti1Image(
                        arg.get_data().reshape(argshape + (1,)*(3-len(argshape))),
                        arg.affine, arg.header)
        else:
            argshape = arg.shape

        if len(argshape) == 4:
            if argshape[-1] > 1:
                warning("For now plot_lightbox can handle only 3d, 4d data was provided."
                        " Plotting only the first volume")
            if isinstance(arg, SpatialImage):
                arg = nb.Nifti1Image(arg.get_data()[..., 0], arg.affine, arg.header)
            else:
                arg = arg[..., 0]
        elif len(argshape) != 3:
            raise ValueError, "For now just handling 3D volumes"
        return arg
Example #15
 def _forward_data(self, data):
     params = self.params
     try:
         mapped = filtfilt(self.__iir_num,
                           self.__iir_denom,
                           data,
                           axis=params.axis,
                           padtype=params.padtype,
                           padlen=params.padlen)
     except TypeError:
         # we have an ancient scipy, do manually
         # but it will only support 2d arrays
         if params.axis == 0:
             data = data.T
         if params.axis > 1:
             raise ValueError("this version of scipy does not "
                              "support nd-arrays for filtfilt()")
         if not (params['padlen'].is_default and params['padtype'].is_default):
             warning("this version of scipy.signal.filtfilt() does not "
                     "support `padlen` and `padtype` arguments -- ignoring "
                     "them")
         mapped = [filtfilt(self.__iir_num,
                            self.__iir_denom,
                            x)
                 for x in data]
         mapped = np.array(mapped)
         if params.axis == 0:
             mapped = mapped.T
     return mapped
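Note: the mapper is essentially a thin wrapper around scipy.signal.filtfilt applied along a configurable axis, with a per-row fallback for very old SciPy versions. A minimal sketch of the modern code path; the filter design parameters are arbitrary and purely illustrative:

import numpy as np
from scipy.signal import butter, filtfilt

b, a = butter(4, 0.2)                 # some low-pass IIR filter
data = np.random.randn(2, 500)        # two "channels", time along axis 1
smoothed = filtfilt(b, a, data, axis=1)
print(smoothed.shape)                 # (2, 500)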
Example #16
def _verified_reverse1(mapper, onesample):
    """Replacement of Mapper.reverse1 with safety net

    This function can be called instead of a direct call to a mapper's
    ``reverse1()``. It wraps a single sample into a dummy axis and calls
    ``reverse()``. Afterwards it verifies that the first axis of the
    returned array has one item only, otherwise it will issue a warning.
    This function is useful in any context where it is critical to ensure
    that reverse mapping a single sample yields exactly one sample -- which
    isn't guaranteed due to the flexible nature of mappers.

    Parameters
    ----------
    mapper : Mapper instance
    onesample : array-like
      Single sample (in terms of the supplied mapper).

    Returns
    -------
    array
      Shape matches a single sample in terms of the mappers input space.
    """
    dummy_axis_sample = np.asanyarray(onesample)[None]
    rsample = mapper.reverse(dummy_axis_sample)
    if not len(rsample) == 1:
        warning("Reverse mapping single sample yielded multiple -- can lead to unintended behavior!")
    return rsample[0]
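Note: the safety net amounts to "wrap in a dummy axis, reverse-map, check that exactly one sample comes back". A plain-NumPy sketch with a stand-in for the mapper's reverse() (un-flattening each sample to 2x3); reverse1_with_check is a hypothetical name:

import numpy as np

def reverse1_with_check(reverse, onesample):
    rsample = reverse(np.asanyarray(onesample)[None])   # add dummy sample axis
    if len(rsample) != 1:
        print("warning: reverse mapping yielded %d samples" % len(rsample))
    return rsample[0]

def unflatten(samples):
    return samples.reshape(len(samples), 2, 3)   # pretend "reverse" of a flattener

print(reverse1_with_check(unflatten, np.arange(6)).shape)   # (2, 3)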
Example #17
    def _get_increments(self, ndim):
        """Creates a list of increments for a given dimensionality

        RF: lame yoh just cut-pasted and tuned up because everything
            depends on ndim...
        """
        # Set element_sizes
        element_sizes = self._element_sizes
        if element_sizes is None:
            element_sizes = np.ones(ndim)
        else:
            if (ndim != len(element_sizes)):
                raise ValueError, \
                      "Dimensionality mismatch: element_sizes %s provided " \
                      "to constructor had %i dimensions, whenever queried " \
                      "coordinate had %i" \
                      % (element_sizes, len(element_sizes), ndim)
        center = np.zeros(ndim)

        element_sizes = np.asanyarray(element_sizes)
        # What range for each dimension
        erange = np.ceil(self._radius / element_sizes).astype(int)

        tentative_increments = np.array(list(np.ndindex(tuple(erange*2 + 1)))) \
                               - erange
        # Filter out the ones beyond the "sphere"
        res = array([
            x for x in tentative_increments
            if self._inner_radius < self._distance_func(
                x * element_sizes, center) <= self._radius
        ])

        if not len(res):
            warning("%s defines no neighbors" % self)
        return res
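Note: the neighborhood is built by enumerating all integer offsets in a bounding box and keeping those whose (scaled) distance from the center falls inside the radius. A standalone Euclidean-distance sketch; sphere_increments is hypothetical and only handles the outer radius:

import numpy as np

def sphere_increments(radius, element_sizes):
    element_sizes = np.asanyarray(element_sizes, dtype=float)
    erange = np.ceil(radius / element_sizes).astype(int)
    candidates = np.array(list(np.ndindex(tuple(erange * 2 + 1)))) - erange
    dist = np.sqrt(((candidates * element_sizes) ** 2).sum(axis=1))
    return candidates[dist <= radius]

print(len(sphere_increments(1.0, [1.0, 1.0, 1.0])))   # 7: center + 6 face neighbors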
Example #18
    def _get_increments(self, ndim):
        """Creates a list of increments for a given dimensionality

        RF: lame yoh just cut-pasted and tuned up because everything
            depends on ndim...
        """
        # Set element_sizes
        element_sizes = self._element_sizes
        if element_sizes is None:
            element_sizes = np.ones(ndim)
        else:
            if (ndim != len(element_sizes)):
                raise ValueError, \
                      "Dimensionality mismatch: element_sizes %s provided " \
                      "to constructor had %i dimensions, whenever queried " \
                      "coordinate had %i" \
                      % (element_sizes, len(element_sizes), ndim)
        center = np.zeros(ndim)

        element_sizes = np.asanyarray(element_sizes)
        # What range for each dimension
        erange = np.ceil(self._radius / element_sizes).astype(int)

        tentative_increments = np.array(list(np.ndindex(tuple(erange*2 + 1)))) \
                               - erange
        # Filter out the ones beyond the "sphere"
        res = array([x for x in tentative_increments
                      if self._inner_radius
                      < self._distance_func(x * element_sizes, center)
                      <= self._radius])

        if not len(res):
            warning("%s defines no neighbors" % self)
        return res
Example #19
    def __init__(self, **kwargs):
        """Initialize an SMLR classifier.
        """
        """
        TODO:
         # Add in likelihood calculation
         # Add kernels, not just direct methods.
         """
        # init base class first
        Classifier.__init__(self, **kwargs)

        if _cStepwiseRegression is None and self.params.implementation == 'C':
            warning('SMLR: C implementation is not available.'
                    ' Using pure Python one')
            self.params.implementation = 'Python'

        # pylint friendly initializations
        self._ulabels = None
        """Unigue labels from the training set."""
        self.__weights_all = None
        """Contains all weights including bias values"""
        self.__weights = None
        """Just the weights, without the biases"""
        self.__biases = None
        """The biases, will remain none if has_bias is False"""
Example #20
 def _forward_data(self, data):
     params = self.params
     try:
         mapped = filtfilt(self.__iir_num,
                           self.__iir_denom,
                           data,
                           axis=params.axis,
                           padtype=params.padtype,
                           padlen=params.padlen)
     except TypeError:
         # we have an ancient scipy, do manually
         # but it will only support 2d arrays
         if params.axis == 0:
             data = data.T
         if params.axis > 1:
             raise ValueError("this version of scipy does not "
                              "support nd-arrays for filtfilt()")
         if not (params['padlen'].is_default
                 and params['padtype'].is_default):
             warning("this version of scipy.signal.filtfilt() does not "
                     "support `padlen` and `padtype` arguments -- ignoring "
                     "them")
         mapped = [
             filtfilt(self.__iir_num, self.__iir_denom, x) for x in data
         ]
         mapped = np.array(mapped)
         if params.axis == 0:
             mapped = mapped.T
     return mapped
Example #21
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium']
        train = train[train.sa.train == 1]
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium']
        test3 = test3[test3.sa.train == 1]
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferMeasure(l_clf, Splitter('train', attr_values=[1,1]),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

        self.assertRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        te = np.asscalar(te)
        self.assertTrue(abs(e-te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
                "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        if 'multiclass' in l_clf.__tags__:
            self.assertFalse(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
Example #22
    def append(self, other):
        """This method should not be used and will be removed in the future"""
        warning(
            "AttrDataset.append() is deprecated and will be removed. "
            "Instead of ds.append(x) use: ds = vstack((ds, x), a=0)"
        )

        if not self.nfeatures == other.nfeatures:
            raise DatasetError("Cannot merge datasets, because the number of " "features does not match.")

        if not sorted(self.sa.keys()) == sorted(other.sa.keys()):
            raise DatasetError(
                "Cannot merge dataset. This datasets samples "
                "attributes %s cannot be mapped into the other "
                "set %s" % (self.sa.keys(), other.sa.keys())
            )

        # concat the samples as well
        self.samples = np.concatenate((self.samples, other.samples), axis=0)

        # tell the collection the new desired length of all attributes
        self.sa.set_length_check(len(self.samples))
        # concat all samples attributes
        for k, v in other.sa.iteritems():
            self.sa[k].value = np.concatenate((self.sa[k].value, v.value), axis=0)
Example #23
    def label_voxel(self, c, levels=None):

        if self.__referenceLevel is None:
            warning("You did not provide what level to use "
                    "for reference. Assigning 0th level -- '%s'" %
                    (self._levels[0], ))
            self.set_reference_level(0)
            # return self.__referenceAtlas.label_voxel(c, levels)

        c = self._check_range(c)

        # obtain coordinates of the closest voxel
        cref = self._data[self.__referenceLevel.indexes, c[0], c[1], c[2]]
        dist = norm((cref - c) * self.voxdim)
        if __debug__:
            debug(
                'ATL__', "Closest referenced point for %r is "
                "%r at distance %3.2f" % (c, cref, dist))
        if (self.distance - dist) >= 1e-3:  # neglect everything smaller
            result = self.__referenceAtlas.label_voxel(cref, levels)
            result['voxel_referenced'] = c
            result['distance'] = dist
        else:
            result = self.__referenceAtlas.label_voxel(c, levels)
            if __debug__:
                debug(
                    'ATL__', "Closest referenced point is "
                    "further than desired distance %.2f" % self.distance)
            result['voxel_referenced'] = None
            result['distance'] = 0
        return result
Example #24
def _check_cosmo_dataset(cosmo):
    '''
    Helper function to ensure a cosmo input for cosmo_dataset is valid.
    Currently does two things:
    (1) raise an error if there are no samples
    (2) raise a warning if samples have very large or very small values. A use
        case is certain MEEG datasets with very small sample values
        (in the order of 1e-25) which affects some classifiers
    '''

    samples = cosmo.get('samples', None)

    if samples is None:
        raise KeyError("Missing field .samples in %s" % cosmo)

    # check for extreme values
    warn_for_extreme_values_decimals = 10

    # ignore NaNs and infinity
    nonzero_msk = np.logical_and(np.isfinite(samples), samples != 0)

    if np.any(nonzero_msk):
        max_nonzero = np.max(np.abs(samples[nonzero_msk]))

        # see how many decimals in the largest absolute value
        decimals_nonzero = np.log10(max_nonzero)

        if abs(decimals_nonzero) > warn_for_extreme_values_decimals:
            msg = (
                'Samples have extreme values, maximum absolute value is %s; '
                'This may affect some analyses. Consider scaling the samples, '
                'e.g. by a factor of 10**%d ' % (
                    max_nonzero, -decimals_nonzero))
            warning(msg)
Example #25
    def _predict(self, data):
        """Predict using the skl learner
        """
        try:
            res = self._skl_learner.predict(data)
        except Exception as e:
            raise FailedToPredictError("Failed to predict %s on data of shape %s. Got '%s' during" \
                  " call to predict()." % (self, data.shape, e))

        if self.enforce_dim:
            res_dim = len(res.shape)
            if res_dim > self.enforce_dim:
                # would throw meaningful exception if not possible
                res = res.reshape(res.shape[:self.enforce_dim])
            elif res_dim < self.enforce_dim:
                # broadcast
                res = res.reshape(res.shape + (1, ) *
                                  (self.enforce_dim - res_dim))
        # Estimate estimates after predict, so if something goes
        # wrong, above exception handling occurs
        if self.ca.is_enabled('probabilities'):
            if hasattr(self._skl_learner, 'predict_proba'):
                # Duplication of computation, since in many scenarios
                # predict() calls predict_proba()
                self.ca.probabilities = self._skl_learner.predict_proba(data)
            else:
                warning("%s has no predict_proba() defined, so no probability"
                        " estimates could be extracted" % self._skl_learner)
        self.ca.estimates = res
        return res
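Note: enforce_dim only reshapes the prediction array, dropping superfluous trailing singleton axes or appending missing ones. The reshape logic in isolation, with made-up values:

import numpy as np

res = np.arange(6).reshape(3, 2, 1)     # e.g. a prediction with an extra trailing axis
enforce_dim = 2
if res.ndim > enforce_dim:
    res = res.reshape(res.shape[:enforce_dim])
elif res.ndim < enforce_dim:
    res = res.reshape(res.shape + (1,) * (enforce_dim - res.ndim))
print(res.shape)                        # (3, 2)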
Example #26
def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    Parameters
    ----------
    cdf_func : callable
      Function to be used to derive cdf values for x
    tail : str ('left', 'right', 'any', 'both')
      Which tail of the distribution to report. For 'any' and 'both'
      it chooses the tail it belongs to based on the comparison to
      p=0.5. In the case of 'any' significance is taken like in a
      one-tailed test.
    return_tails : bool
      If True, return a tuple (pvalues, tails), where tails contains
      1s for values from the right tail and 0s for values from the
      left tail.
    """
    is_scalar = np.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and "CHECK_STABILITY" in debug.active:
        cdf_min, cdf_max = np.min(cdf), np.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ("", " for %s" % name)[int(name is not None)]
            warning("Stability check of cdf %s failed%s. Min=%s, max=%s" % (cdf_func, s, cdf_min, cdf_max))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = np.clip(cdf, 0, 1.0)

    if tail == "left":
        if return_tails:
            right_tail = np.zeros(cdf.shape, dtype=bool)
    elif tail == "right":
        cdf = 1 - cdf
        if return_tails:
            right_tail = np.ones(cdf.shape, dtype=bool)
    elif tail in ("any", "both"):
        right_tail = cdf >= 0.5
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == "both":
            # we need report the area under both tails
            # XXX this is only meaningful for symmetric distributions
            cdf *= 2

    # Assure that NaNs didn't get significant value
    cdf[np.isnan(x)] = 1.0
    if is_scalar:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res
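Note: for the 'both' tail the value is folded onto the nearer tail via the CDF and the area is doubled. A compact sketch of just that branch, using scipy.stats.norm as an example CDF; p_both_tails is a hypothetical helper:

import numpy as np
from scipy.stats import norm

def p_both_tails(x, cdf_func):
    cdf = np.clip(cdf_func(np.atleast_1d(x)), 0, 1.0)   # keep CDF in [0, 1]
    right = cdf >= 0.5
    cdf[right] = 1.0 - cdf[right]                       # fold onto the nearer tail
    return 2 * cdf                                      # area under both tails

print(p_both_tails(1.96, norm.cdf))   # ~ [0.05]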
Example #27
    def train(self, ds):
        """
        The default implementation calls ``_pretrain()``, ``_train()``, and
        finally ``_posttrain()``.

        Parameters
        ----------
        ds: Dataset
          Training dataset.

        Returns
        -------
        None
        """
        got_ds = is_datasetlike(ds)

        # TODO remove first condition if all Learners get only datasets
        if got_ds and (ds.nfeatures == 0 or len(ds) == 0):
            raise DegenerateInputError(
                "Cannot train learner on degenerate data %s" % ds)
        if __debug__:
            debug(
                "LRN",
                "Training learner %(lrn)s on dataset %(dataset)s",
                msgargs={'lrn': self, 'dataset': ds})

        self._pretrain(ds)

        # remember the time when started training
        t0 = time.time()

        if got_ds:
            # things might have happened during pretraining
            if ds.nfeatures > 0:
                self._train(ds)
            else:
                warning("Trying to train on dataset with no features present")
                if __debug__:
                    debug("LRN",
                          "No features present for training, no actual training "
                          "is called")
        else:
            # in this case we claim to have no idea and simply try to train
            self._train(ds)

        # store timing
        self.ca.training_time = time.time() - t0

        # and post-proc
        self._posttrain(ds)

        # finally flag as trained
        self._set_trained()

        if __debug__:
            debug(
                "LRN",
                "Finished training learner %(lrn)s on dataset %(dataset)s",
                msgargs={'lrn': self, 'dataset': ds})
Example #28
    def __init__(self, generator, queryengine, errorfx=mean_mismatch_error,
                 indexsum=None,
                 reuse_neighbors=False,
                 splitter=None,
                 **kwargs):
        """Initialize the base class for "naive" searchlight classifiers

        Parameters
        ----------
        generator : `Generator`
          Some `Generator` to prepare partitions for cross-validation.
          It must not change "targets", thus e.g. no AttributePermutator's
        errorfx : func, optional
          Functor that computes a scalar error value from the vectors of
          desired and predicted values (e.g. subclass of `ErrorFunction`).
        indexsum : ('sparse', 'fancy'), optional
          What to use to compute sums over arbitrary columns.  'fancy'
          corresponds to regular fancy indexing over columns, whereas
          with 'sparse' a product of sparse matrices is used (usually
          faster, and hence the default if `scipy` is available).
        reuse_neighbors : bool, optional
          Compute neighbors information only once, thus allowing for
          efficient reuse on subsequent calls where dataset's feature
          attributes remain the same (e.g. during permutation testing)
        splitter : Splitter, optional
          Which will be used to split partitioned datasets.  If None,
          the standard splitter operating on partitions will be used.
        """

        # init base class first
        BaseSearchlight.__init__(self, queryengine, **kwargs)

        self._errorfx = errorfx
        self._generator = generator
        self._splitter = splitter

        # TODO: move into _call since resetting over default None
        #       obscures __repr__
        if indexsum is None:
            if externals.exists('scipy'):
                indexsum = 'sparse'
            else:
                indexsum = 'fancy'
        else:
            if indexsum == 'sparse' and not externals.exists('scipy'):
                warning("Scipy.sparse isn't available so taking 'fancy' as "
                        "'indexsum' method.")
                indexsum = 'fancy'
        self._indexsum = indexsum

        if not self.nproc in (None, 1):
            raise NotImplementedError, "For now only nproc=1 (or None for " \
                  "autodetection) is supported by GNBSearchlight"

        self.__pb = None            # statistics per each block/label
        self.__reuse_neighbors = reuse_neighbors

        # Storage to be used for neighborhood information
        self.__roi_fids = None
Example #29
    def _level3(self, datasets):
        params = self.params            # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace
        # Fixing nproc=0
        if params.nproc == 0:
            from mvpa2.base import warning
            warning("nproc of 0 doesn't make sense. Setting nproc to 1.")
            params.nproc = 1
        # Checking for joblib, if not, set nproc to 1
        if params.nproc != 1:
            from mvpa2.base import externals, warning
            if not externals.exists('joblib'):
                warning("Setting nproc different from 1 requires joblib package, which "
                        "does not seem to exist. Setting nproc to 1.")
                params.nproc = 1

        # start from original input datasets again
        if params.nproc == 1:
            residuals = []
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 3: ds #%i" % i)
                m, residual = get_trained_mapper(ds_new, self.commonspace, m,
                                                 self.ca['residual_errors'].enabled)
                if self.ca['residual_errors'].enabled:
                    residuals.append(residual)
        else:
            if __debug__:
                debug('HPAL_', "Level 3: Using joblib with nproc = %d " % params.nproc)
            verbose_level_parallel = 20 \
                if (__debug__ and 'HPAL' in debug.active) else 0
            from joblib import Parallel, delayed
            import sys
            # joblib's 'multiprocessing' backend has known issues of failure on OSX
            # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3
            if params.joblib_backend is None:
                params.joblib_backend = 'threading' if sys.platform == 'darwin' \
                                        else 'multiprocessing'
            res = Parallel(
                    n_jobs=params.nproc, pre_dispatch=params.nproc,
                    backend=params.joblib_backend,
                    verbose=verbose_level_parallel
                    )(
                        delayed(get_trained_mapper)
                        (ds, self.commonspace, mapper, self.ca['residual_errors'].enabled)
                        for ds, mapper in zip(datasets, mappers)
                    )
            mappers = [m for m, r in res]
            if self.ca['residual_errors'].enabled:
                residuals = [r for m, r in res]

        if self.ca['residual_errors'].enabled:
            self.ca.residual_errors = Dataset(samples=np.array(residuals)[None, :])

        return mappers
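Note: the parallel branch is standard joblib usage: one delayed() call per (dataset, mapper) pair, dispatched by Parallel with the selected backend. A minimal sketch assuming joblib is installed; fit_one is a stand-in for get_trained_mapper:

from joblib import Parallel, delayed

def fit_one(i):
    return i * i      # stand-in for training one mapper on one dataset

results = Parallel(n_jobs=2, backend='threading')(
    delayed(fit_one)(i) for i in range(4))
print(results)        # [0, 1, 4, 9]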
Example #30
def seed(random_seed):
    if __debug__:
        debug('SG', "Seeding shogun's RNG with %s" % random_seed)
    try:
        # reuse the same seed for shogun
        shogun.Library.Math_init_random(random_seed)
    except Exception, e:
        warning('Shogun cannot be seeded due to %s' % (e,))
Example #31
 def corr_error_prob(predicted, target):
     """Computes p-value of correlation between the target and the predicted
     values.
     """
     from mvpa2.base import warning
     warning("p-value for correlation is implemented only when scipy is "
             "available. Bogus value -1.0 is returned otherwise")
     return -1.0
Example #33
def _extract_boxcar_events(ds, events=None, time_attr=None, match="prev", eprefix="event", event_mapper=None):
    """see eventrelated_dataset() for docs"""
    # relabel argument
    conv_strategy = {"prev": "floor", "next": "ceil", "closest": "round"}[match]

    if not time_attr is None:
        tvec = ds.sa[time_attr].value
        # we are asked to convert onset time into sample ids
        descr_events = []
        for ev in events:
            # do not mess with the input data
            ev = copy.deepcopy(ev)
            # best matching sample
            idx = value2idx(ev["onset"], tvec, conv_strategy)
            # store offset of sample time and real onset
            ev["orig_offset"] = ev["onset"] - tvec[idx]
            # rescue the real onset into a new attribute
            ev["orig_onset"] = ev["onset"]
            ev["orig_duration"] = ev["duration"]
            # figure out how many samples we need
            ev["duration"] = len(tvec[idx:][tvec[idx:] < ev["onset"] + ev["duration"]])
            # new onset is sample index
            ev["onset"] = idx
            descr_events.append(ev)
    else:
        descr_events = events
    # convert the event specs into the format expected by BoxcarMapper
    # take the first event as an example of contained keys
    evvars = _events2dict(descr_events)
    # checks
    for p in ["onset", "duration"]:
        if not p in evvars:
            raise ValueError("'%s' is a required property for all events." % p)
    boxlength = max(evvars["duration"])
    if __debug__:
        if not max(evvars["duration"]) == min(evvars["duration"]):
            warning("Boxcar mapper will use maximum boxlength (%i) of all " "provided Events." % boxlength)

    # finally create, train and use the boxcar mapper
    bcm = BoxcarMapper(evvars["onset"], boxlength, space=eprefix)
    bcm.train(ds)
    ds = ds.get_mapped(bcm)
    if event_mapper is None:
        # at last reflatten the dataset
        # could we add some meaningful attribute during this mapping, i.e. would
        # assigning 'inspace' do something good?
        ds = ds.get_mapped(FlattenMapper(shape=ds.samples.shape[1:]))
    else:
        ds = ds.get_mapped(event_mapper)
    # add samples attributes for the events, simply dump everything as a samples
    # attribute
    # special case onset and duration in case of conversion into discrete time
    if not time_attr is None:
        for attr in ("onset", "duration"):
            evvars[attr] = [e[attr] for e in events]
    ds = _evvars2ds(ds, evvars, eprefix)

    return ds
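Note: converting an onset given in seconds into a sample index against the acquisition-time vector (the 'prev'/'floor' strategy) and counting how many samples fall inside the duration can be mimicked with np.searchsorted; value2idx is PyMVPA-specific, so this sketch only approximates its 'floor' behavior for a sorted time vector:

import numpy as np

tvec = np.arange(0.0, 10.0, 2.0)     # acquisition times: 0, 2, 4, 6, 8 s
onset, duration = 3.1, 4.0           # one event, in seconds

idx = np.searchsorted(tvec, onset, side='right') - 1        # 'floor' match -> 1
nsamples = len(tvec[idx:][tvec[idx:] < onset + duration])    # samples covered -> 3
print(idx, nsamples)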
Example #34
    def _call(self, dataset):
        """Perform the ROI search.
        """
        # local binding
        nproc = self.nproc

        if nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1"
                        % externals.versions['pprocess'])
                nproc = 1
        # train the queryengine
        self._queryengine.train(dataset)

        # decide whether to run on all possible center coords or just a provided
        # subset
        if isinstance(self.__roi_ids, str):
            roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
        elif self.__roi_ids is not None:
            roi_ids = self.__roi_ids
            # safeguard against stupidity
            if __debug__:
                if max(roi_ids) >= dataset.nfeatures:
                    raise IndexError, \
                          "Maximal center_id found is %s whenever given " \
                          "dataset has only %d features" \
                          % (max(roi_ids), dataset.nfeatures)
        else:
            roi_ids = np.arange(dataset.nfeatures)

        # pass to subclass
        results, roi_sizes = self._sl_call(dataset, roi_ids, nproc)

        if not roi_sizes is None:
            self.ca.roi_sizes = roi_sizes

        if 'mapper' in dataset.a:
            # since we know the space we can stick the original mapper into the
            # results as well
            if self.__roi_ids is None:
                results.a['mapper'] = copy.copy(dataset.a.mapper)
            else:
                # there is an additional selection step that needs to be
                # expressed by another mapper
                mapper = copy.copy(dataset.a.mapper)
                mapper.append(StaticFeatureSelection(roi_ids,
                                                     dshape=dataset.shape[1:]))
                results.a['mapper'] = mapper

        # charge state
        self.ca.raw_results = results

        # return raw results, base-class will take care of transformations
        return results
Example #35
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if __debug__ and chunks_attr is not None:
            nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
            min_nsamples_per_chunk = np.min(nsamples_per_chunk.values())
            if min_nsamples_per_chunk in range(3, 6):
                warning(
                    "Z-scoring chunk-wise having a chunk with only "
                    "%d samples is 'discouraged'. "
                    "You have chunks with following number of samples: %s" % (
                        min_nsamples_per_chunk,
                        nsamples_per_chunk,
                    ))
            if min_nsamples_per_chunk <= 2:
                warning(
                    "Z-scoring chunk-wise having a chunk with less "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with following number of samples: %s" %
                    (nsamples_per_chunk, ))

        params = self.__params_dict
        if params is None:
            raise RuntimeError, \
                  "ZScoreMapper needs to be trained before call to forward"

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)
            # but deepcopy the samples since _zscore would modify inplace
            mds.samples = mds.samples.copy()

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if '__all__' in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params['__all__'])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if not c in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" %
                        (self.__class__.__name__, c))
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                                   *params[c])

        return mds
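Note: chunk-wise z-scoring simply standardizes each chunk with its own mean and standard deviation, which is why the warnings above discourage chunks with only one or two samples. A plain-NumPy sketch with two made-up chunks:

import numpy as np

samples = np.random.randn(6, 4) * 5 + 3
chunks = np.array([0, 0, 0, 1, 1, 1])

zscored = samples.astype(float)
for c in np.unique(chunks):
    sl = chunks == c
    mu, sd = samples[sl].mean(axis=0), samples[sl].std(axis=0)
    zscored[sl] = (samples[sl] - mu) / sd

print(np.allclose(zscored[chunks == 0].mean(axis=0), 0))   # True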
Example #36
    def _predict(self, data):
        """Predict values for the data
        """
        # libsvm needs doubles
        src = _data2ls(data)
        ca = self.ca

        predictions = [ self.model.predict(p) for p in src ]

        if ca.is_enabled('estimates'):
            if self.__is_regression__:
                estimates = [ self.model.predict_values_raw(p)[0] for p in src ]
            else:
                # if 'trained_targets' are literal they have to be mapped
                if ( np.issubdtype(self.ca.trained_targets.dtype, 'c') or
                     np.issubdtype(self.ca.trained_targets.dtype, 'U') ):
                    trained_targets = self._attrmap.to_numeric(
                            self.ca.trained_targets)
                else:
                    trained_targets = self.ca.trained_targets
                nlabels = len(trained_targets)
                # XXX We do duplicate work. model.predict calls
                # predict_values_raw internally and then does voting or
                # thresholding. So if speed becomes a factor we might
                # want to move out logic from libsvm over here to base
                # predictions on obtained values, or adjust libsvm to
                # spit out values from predict() as well
                if nlabels == 2:
                    # Apparently libsvm reorders labels so we need to
                    # track (1,0) values instead of (0,1) thus just
                    # lets take negative reverse
                    estimates = [ self.model.predict_values(p)[(trained_targets[1],
                                                            trained_targets[0])]
                               for p in src ]
                    if len(estimates) > 0:
                        if __debug__:
                            debug("SVM",
                                  "Forcing estimates to be ndarray and reshaping"
                                  " them into 1D vector")
                        estimates = np.asarray(estimates).reshape(len(estimates))
                else:
                    # In multiclass we return dictionary for all pairs
                    # of labels, since libsvm does 1-vs-1 pairs
                    estimates = [ self.model.predict_values(p) for p in src ]
            ca.estimates = estimates

        if ca.is_enabled("probabilities"):
            # XXX Is this really necessary? yoh don't think so since
            # assignment to ca is doing the same
            #self.probabilities = [ self.model.predict_probability(p)
            #                       for p in src ]
            try:
                ca.probabilities = [ self.model.predict_probability(p)
                                         for p in src ]
            except TypeError:
                warning("Current SVM %s doesn't support probability " %
                        self + " estimation.")
        return predictions
Example #37
def run(args):
    from mvpa2.base.hdf5 import h5save
    ds = None
    if not args.txt_data is None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        samples = _load_from_txt(args.txt_data)
        ds = Dataset(samples)
    elif not args.npy_data is None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        samples = _load_from_npy(args.npy_data)
        ds = Dataset(samples)
    elif not args.mri_data is None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = dict()
        if not args.add_vol_attr is None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(args.add_vol_attr)
            if not len(args.add_vol_attr) == len(vol_attr):
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    if ds is None:
        if args.data is None:
            raise RuntimeError('no data source specified')
        else:
            ds = hdf2ds(args.data)[0]
    else:
        if args.data is not None:
            verbose(
                1,
                'ignoring dataset input in favor of other data source -- remove either one to disambiguate'
            )

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if not args.add_fsl_mcpar is None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            verbose(
                2, "Add motion regressor as sample attribute '%s'" %
                ('mc_' + param))
            ds.sa['mc_' + param] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))
    # and store
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename += '.hdf5'
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
Example #38
    def __init__(self,
                 generator,
                 queryengine,
                 errorfx=mean_mismatch_error,
                 indexsum=None,
                 reuse_neighbors=False,
                 **kwargs):
        """Initialize the base class for "naive" searchlight classifiers

        Parameters
        ----------
        generator : `Generator`
          Some `Generator` to prepare partitions for cross-validation.
          It must not change "targets", thus e.g. no AttributePermutator's
        errorfx : func, optional
          Functor that computes a scalar error value from the vectors of
          desired and predicted values (e.g. subclass of `ErrorFunction`).
        indexsum : ('sparse', 'fancy'), optional
          What to use to compute sums over arbitrary columns.  'fancy'
          corresponds to regular fancy indexing over columns, whereas
          with 'sparse' a product of sparse matrices is used (usually
          faster, and hence the default if `scipy` is available).
        reuse_neighbors : bool, optional
          Compute neighbors information only once, thus allowing for
          efficient reuse on subsequent calls where dataset's feature
          attributes remain the same (e.g. during permutation testing)
        """

        # init base class first
        BaseSearchlight.__init__(self, queryengine, **kwargs)

        self._errorfx = errorfx
        self._generator = generator

        # TODO: move into _call since resetting over default None
        #       obscures __repr__
        if indexsum is None:
            if externals.exists('scipy'):
                indexsum = 'sparse'
            else:
                indexsum = 'fancy'
        else:
            if indexsum == 'sparse' and not externals.exists('scipy'):
                warning("Scipy.sparse isn't available so taking 'fancy' as "
                        "'indexsum' method.")
                indexsum = 'fancy'
        self._indexsum = indexsum

        if not self.nproc in (None, 1):
            raise NotImplementedError, "For now only nproc=1 (or None for " \
                  "autodetection) is supported by GNBSearchlight"

        self.__pb = None  # statistics per each block/label
        self.__reuse_neighbors = reuse_neighbors

        # Storage to be used for neighborhood information
        self.__roi_fids = None
Example #39
def Atlas(filename=None, name=None, *args, **kwargs):
    """A convinience factory for the atlases
    """
    if filename is None:
        if name is None:
            raise ValueError(
                "Please provide either path or name of the atlas to be used")
        atlaspath = KNOWN_ATLASES[name]
        filename = atlaspath % ({'name': name})
        if not os.path.exists(filename):
            raise IOError("File %s for atlas %s was not found" %
                          (filename, name))
    else:
        if name is not None:
            raise ValueError("Provide only filename or name")

    try:
        # Just to guesstimate which atlas this is
        tempAtlas = XMLBasedAtlas(filename=filename,
                                  load_maps=False)  #, *args, **kwargs)
        version = tempAtlas.version
        atlas_source = None
        for cls in [PyMVPAAtlas, FSLAtlas]:
            if cls._check_version(version):
                atlas_source = cls.source
                break
        if atlas_source is None:
            if __debug__: debug('ATL_', "Unknown atlas " + filename)
            return tempAtlas

        atlasTypes = {
            'PyMVPA': {
                "Label": LabelsAtlas,
                "Reference": ReferencesAtlas
            },
            'FSL': {
                "Label": FSLLabelsAtlas,
                "Probabalistic": FSLProbabilisticAtlas,
                "Probabilistic": FSLProbabilisticAtlas,
            }
        }[atlas_source]
        atlasType = tempAtlas.header.type.text
        if atlasType in atlasTypes:
            if __debug__: debug('ATL_', "Creating %s Atlas" % atlasType)
            return atlasTypes[atlasType](filename=filename, *args, **kwargs)
            #return ReferencesAtlas(filename)
        else:
            warning(
                "Unknown %s type '%s' of atlas in %s."
                " Known are %s" %
                (atlas_source, atlasType, filename, list(atlasTypes.keys())),
                2)
            return tempAtlas
    except XMLAtlasException as e:
        print("File %s is not a valid XML based atlas due to %s" \
              % (filename, repr(e)))
        raise e
Example #40
    def _set_retrainable(self, value, force=False):
        """Assign value of retrainable parameter

        If retrainable flag is to be changed, classifier has to be
        untrained.  Also internal attributes such as _changedData,
        __changedData_isset, and __idhashes should be initialized if
        it becomes retrainable
        """
        pretrainable = self.params["retrainable"]
        if (force or value != pretrainable.value) and "retrainable" in self.__tags__:
            if __debug__:
                debug("CLF_", "Setting retrainable to %s" % value)
            if "meta" in self.__tags__:
                warning(
                    "Retrainability is not yet crafted/tested for "
                    "meta classifiers. Unpredictable behavior might occur"
                )
            # assure that we don't drag anything behind
            if self.trained:
                self.untrain()
            ca = self.ca
            if not value and ca.has_key("retrained"):
                ca.pop("retrained")
                ca.pop("repredicted")
            if value:
                if not "retrainable" in self.__tags__:
                    warning(
                        "Setting of flag retrainable for %s has no effect"
                        " since classifier has no such capability. It would"
                        " just lead to resources consumption and slowdown" % self
                    )
                ca["retrained"] = ConditionalAttribute(enabled=True, doc="Either retrainable classifier was retrained")
                ca["repredicted"] = ConditionalAttribute(
                    enabled=True, doc="Either retrainable classifier was repredicted"
                )

            pretrainable.value = value

            # if retrainable we need to keep track of things
            if value:
                self.__idhashes = {"traindata": None, "targets": None, "testdata": None}  # , 'testtraindata': None}
                if __debug__ and "CHECK_RETRAIN" in debug.active:
                    # ??? it is not clear though if idhash is faster than
                    # simple comparison of (dataset != __traineddataset).any(),
                    # but if we like to get rid of __traineddataset then we
                    # should use idhash anyways
                    self.__trained = self.__idhashes.copy()  # just same Nones
                self.__reset_changed_data()
                self.__invalidatedChangedData = {}
            elif "retrainable" in self.__tags__:
                # self.__reset_changed_data()
                self.__changedData_isset = False
                self._changedData = None
                self.__idhashes = None
                if __debug__ and "CHECK_RETRAIN" in debug.active:
                    self.__trained = None
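
For context, a hedged sketch of how the flag is typically driven; the classifier class below is a placeholder, since only classifiers whose __tags__ include 'retrainable' actually honour it (others merely trigger the warning above).

# `SomeRetrainableClassifier` is hypothetical -- substitute a classifier
# tagged 'retrainable'.
clf = SomeRetrainableClassifier(retrainable=True)
clf.train(training_ds)          # first, full training
clf.train(training_ds_updated)  # later calls may reuse cached state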
Exemple #41
0
    def __init__(self, space='targets', **kwargs):
        ProjectionMapper.__init__(self, space=space, **kwargs)

        self._scale = None
        """Estimated scale"""
        if self.params.svd == 'dgesvd' and not externals.exists('liblapack.so'):
            warning("Reverting choice of svd for ProcrusteanMapper to be default "
                    "'numpy' since liblapack.so seems not to be available for "
                    "'dgesvd'")
            self.params.svd = 'numpy'
Exemple #43
0
    def _set_retrainable(self, value, force=False):
        """Assign value of retrainable parameter

        If retrainable flag is to be changed, classifier has to be
        untrained.  Also internal attributes such as _changedData,
        __changedData_isset, and __idhashes should be initialized if
        it becomes retrainable
        """
        pretrainable = self.params['retrainable']
        if (force or value != pretrainable.value) \
               and 'retrainable' in self.__tags__:
            if __debug__:
                debug("CLF_", "Setting retrainable to %s" % value)
            if 'meta' in self.__tags__:
                warning("Retrainability is not yet crafted/tested for "
                        "meta classifiers. Unpredictable behavior might occur")
            # assure that we don't drag anything behind
            if self.trained:
                self.untrain()
            ca = self.ca
            if not value and 'retrained' in ca:
                ca.pop('retrained')
                ca.pop('repredicted')
            if value:
                if 'retrainable' not in self.__tags__:
                    warning("Setting the retrainable flag for %s has no effect"
                            " since the classifier has no such capability. It would"
                            " just lead to resource consumption and slowdown"
                            % self)
                ca['retrained'] = ConditionalAttribute(enabled=True,
                        doc="Whether retrainable classifier was retrained")
                ca['repredicted'] = ConditionalAttribute(enabled=True,
                        doc="Whether retrainable classifier was repredicted")

            pretrainable.value = value

            # if retrainable we need to keep track of things
            if value:
                self.__idhashes = {'traindata': None, 'targets': None,
                                   'testdata': None} #, 'testtraindata': None}
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    # ??? it is not clear though if idhash is faster than
                    # simple comparison of (dataset != __traineddataset).any(),
                    # but if we like to get rid of __traineddataset then we
                    # should use idhash anyways
                    self.__trained = self.__idhashes.copy() # just same Nones
                self.__reset_changed_data()
                self.__invalidatedChangedData = {}
            elif 'retrainable' in self.__tags__:
                #self.__reset_changed_data()
                self.__changedData_isset = False
                self._changedData = None
                self.__idhashes = None
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    self.__trained = None
Exemple #44
0
    def fit(self, measure, ds):
        """Fit the distribution by performing multiple cycles which repeatedly
        permuted labels in the training dataset.

        Parameters
        ----------
        measure : Measure or None
          A measure used to compute the results from shuffled data. Can be None
          if a measure instance has been provided to the constructor.
        ds : `Dataset`
          Dataset which gets permuted and used to compute the measure/transfer
          error multiple times.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa2.base.learner import LearnerError

        # prefer the already assigned measure over anything that was passed
        # to the function.
        # XXX that is a bit awkward but is necessary to keep the code changes
        # in the rest of PyMVPA minimal till this behavior become mandatory
        if self._measure is not None:
            measure = self._measure
            measure.untrain()

        dist_samples = []
        """Holds the values for randomized labels."""

        # estimate null-distribution
        # TODO this really needs to be more clever! If data samples are
        # shuffled within a class it really makes no difference for the
        # classifier, hence the number of permutations to estimate the
        # null-distribution of transfer errors can be reduced dramatically
        # when the *right* permutations (the ones that matter) are done.
        skipped = 0  # # of skipped permutations
        for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            if __debug__:
                debug("STATMC", "Doing %i permutations: %i" % (self.__permutator.nruns, p + 1), cr=True)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(permuted_ds)
                dist_samples.append(res.samples)
            except LearnerError as e:
                if __debug__:
                    debug("STATMC", " skipped", cr=True)
                warning(
                    "Failed to obtain value from %s due to %s.  Measurement"
                    " was skipped, which could lead to unstable and/or"
                    " incorrect assessment of the null_dist" % (measure, e)
                )
                skipped += 1
                continue
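
The loop above, reduced to a self-contained NumPy sketch with no PyMVPA API: permute the targets, recompute a statistic for each permutation, and collect the values as an empirical null distribution (names here are illustrative).

import numpy as np

def null_distribution(statistic, samples, targets, n_perms=1000, seed=0):
    """Empirical null of `statistic(samples, targets)` under label permutation."""
    rng = np.random.default_rng(seed)
    dist = []
    for _ in range(n_perms):
        dist.append(statistic(samples, rng.permutation(targets)))
    return np.asarray(dist)

# e.g. correlation of the first feature with (permuted) numeric targets:
# corr = lambda X, y: np.corrcoef(X[:, 0], y)[0, 1]
# null = null_distribution(corr, X, y)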
Exemple #45
0
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if __debug__ and chunks_attr is not None:
            nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
            min_nsamples_per_chunk = np.min(list(nsamples_per_chunk.values()))
            if min_nsamples_per_chunk in range(3, 6):
                warning(
                    "Z-scoring chunk-wise with a chunk of only "
                    "%d samples is discouraged. "
                    "You have chunks with the following numbers of samples: %s"
                    % (min_nsamples_per_chunk, nsamples_per_chunk)
                )
            if min_nsamples_per_chunk <= 2:
                warning(
                    "Z-scoring chunk-wise with a chunk of fewer "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with the following numbers of samples: %s"
                    % (nsamples_per_chunk,)
                )

        params = self.__params_dict
        if params is None:
            raise RuntimeError(
                "ZScoreMapper needs to be trained before calling forward()")

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)
            # but deepcopy the samples since _zscore would modify inplace
            mds.samples = mds.samples.copy()

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if "__all__" in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params["__all__"])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if c not in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" % (self.__class__.__name__, c)
                    )
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer], *params[c])

        return mds
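
A plain-NumPy illustration of why the warnings above exist: z-scoring a chunk that contains only two samples collapses every feature to -1/+1 regardless of the data. This sketch is independent of ZScoreMapper.

import numpy as np

def zscore_rows(x):
    """Z-score each column of `x` across rows (population std, ddof=0)."""
    std = x.std(axis=0)
    std[std == 0] = 1.0                    # avoid division by zero
    return (x - x.mean(axis=0)) / std

two_sample_chunk = np.array([[1.0, 10.0],
                             [3.0, 20.0]])
print(zscore_rows(two_sample_chunk))       # [[-1. -1.]
                                           #  [ 1.  1.]]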
Exemple #46
0
    def fit(self, measure, ds):
        """Fit the distribution by performing multiple cycles which repeatedly
        permuted labels in the training dataset.

        Parameters
        ----------
        measure : Measure or None
          A measure used to compute the results from shuffled data. Can be None
          if a measure instance has been provided to the constructor.
        ds : `Dataset`
          Dataset which gets permuted and used to compute the measure/transfer
          error multiple times.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa2.base.learner import LearnerError

        # prefer the already assigned measure over anything that was passed
        # to the function.
        # XXX that is a bit awkward but is necessary to keep the code changes
        # in the rest of PyMVPA minimal till this behavior become mandatory
        if self._measure is not None:
            measure = self._measure
            measure.untrain()

        dist_samples = []
        """Holds the values for randomized labels."""

        # estimate null-distribution
        # TODO this really needs to be more clever! If data samples are
        # shuffled within a class it really makes no difference for the
        # classifier, hence the number of permutations to estimate the
        # null-distribution of transfer errors can be reduced dramatically
        # when the *right* permutations (the ones that matter) are done.
        skipped = 0  # # of skipped permutations
        for p, permuted_ds in enumerate(self.__permutator.generate(ds)):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutator.count, p+1), cr=True)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(permuted_ds)
                dist_samples.append(res.samples)
            except LearnerError as e:
                if __debug__:
                    debug('STATMC', " skipped", cr=True)
                warning(
                    'Failed to obtain value from %s due to %s.  Measurement'
                    ' was skipped, which could lead to unstable and/or'
                    ' incorrect assessment of the null_dist' % (measure, e))
                skipped += 1
                continue
Exemple #47
0
    def _call(self, dataset):
        """Perform the ROI search.
        """
        # local binding
        nproc = self.nproc

        if nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1" %
                        externals.versions['pprocess'])
                nproc = 1
        # train the queryengine
        self._queryengine.train(dataset)

        # decide whether to run on all possible center coords or just a provided
        # subset
        if isinstance(self.__roi_ids, str):
            roi_ids = dataset.fa[self.__roi_ids].value.nonzero()[0]
        elif self.__roi_ids is not None:
            roi_ids = self.__roi_ids
            # safeguard against stupidity
            if __debug__:
                if max(roi_ids) >= dataset.nfeatures:
                    raise IndexError(
                        "Maximal center_id found is %s while the given "
                        "dataset has only %d features"
                        % (max(roi_ids), dataset.nfeatures))
        else:
            roi_ids = np.arange(dataset.nfeatures)

        # pass to subclass
        results = self._sl_call(dataset, roi_ids, nproc)

        if 'mapper' in dataset.a:
            # since we know the space we can stick the original mapper into the
            # results as well
            if self.__roi_ids is None:
                results.a['mapper'] = copy.copy(dataset.a.mapper)
            else:
                # there is an additional selection step that needs to be
                # expressed by another mapper
                mapper = copy.copy(dataset.a.mapper)
                mapper.append(
                    StaticFeatureSelection(roi_ids, dshape=dataset.shape[1:]))
                results.a['mapper'] = mapper

        # charge state
        self.ca.raw_results = results

        # return raw results, base-class will take care of transformations
        return results
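
A hedged, stdlib-only sketch of the nproc resolution done at the top of _call: honour an explicit request, otherwise try to detect the number of cores and fall back to 1. The real code queries pprocess; multiprocessing is used here purely for illustration.

import multiprocessing

def resolve_nproc(requested=None):
    """Return the requested nproc, or a detected core count, defaulting to 1."""
    if requested is not None:
        return requested
    try:
        return multiprocessing.cpu_count() or 1
    except NotImplementedError:            # cpu_count may be unavailable
        return 1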
Exemple #48
0
    def get_bold_run_model(self, model, subj, run):
        """Return the stimulation design for a particular subject/task/run.

        Parameters
        ----------
        model : int
          Model identifier.
        subj : int
          Subject identifier.
        run : int
          Run ID.

        Returns
        -------
        list
          One item per event in the run. All items are dictionaries with the
          following keys: 'condition', 'onset', 'duration', 'intensity',
          'run', 'task', 'onset_idx', 'conset_idx', where the first is a
          literal label, the last four are integer IDs, and the rest are
          typically floating point values. 'onset_idx' is the index of the
          event specification sorted by time across the entire run (typically
          corresponding to a trial index), while 'conset_idx' is analogous but
          contains the onset index per condition, i.e. the nth trial of the
          respective condition in a run.
        """

        conditions = self.get_model_conditions(model)
        events = []
        ev_fields = ('onset', 'duration', 'intensity')

        # get onset info for specific subject/task/run combo
        for cond in conditions:
            task_id = cond['task']
            try:
                evdata = np.atleast_1d(
                    self._load_model_task_run_onsets(
                        subj, model, task_id, run, cond['id']))
            except IOError:
                warning("onset definition file not found; no information "
                        "about condition '%s' for run %i"
                        % (cond['name'], run))
                continue
            for i, ev in enumerate(evdata):
                evdict = dict(zip(ev_fields,
                                  [ev[field] for field in ev_fields]))
                evdict['task'] = task_id
                evdict['condition'] = cond['name']
                evdict['run'] = run
                evdict['conset_idx'] = i
                events.append(evdict)
        events = sorted(events, key=lambda x: x['onset'])
        for i, ev in enumerate(events):
            ev['onset_idx'] = i
        return events
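
To make the return value concrete, a small hand-built example of the events list with hypothetical numbers, sorted by onset and annotated with onset_idx exactly as in the final loop of the method.

events = [
    {'condition': 'house', 'task': 1, 'run': 1, 'onset': 4.5,
     'duration': 2.0, 'intensity': 1.0, 'conset_idx': 0},
    {'condition': 'face', 'task': 1, 'run': 1, 'onset': 0.0,
     'duration': 2.0, 'intensity': 1.0, 'conset_idx': 0},
]
events = sorted(events, key=lambda ev: ev['onset'])
for i, ev in enumerate(events):
    ev['onset_idx'] = i        # 'face' gets 0, 'house' gets 1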
Exemple #49
0
    def get_bold_run_model(self, model, subj, run):
        """Return the stimulation design for a particular subject/task/run.

        Parameters
        ----------
        model : int
          Model identifier.
        subj : int
          Subject identifier.
        run : int
          Run ID.

        Returns
        -------
        list
          One item per event in the run. All items are dictionaries with the
          following keys: 'condition', 'onset', 'duration', 'intensity',
          'run', 'task', 'onset_idx', 'conset_idx', where the first is a
          literal label, the last four are integer IDs, and the rest are
          typically floating point values. 'onset_idx' is the index of the
          event specification sorted by time across the entire run (typically
          corresponding to a trial index), while 'conset_idx' is analogous but
          contains the onset index per condition, i.e. the nth trial of the
          respective condition in a run.
        """

        conditions = self.get_model_conditions(model)
        events = []
        ev_fields = ('onset', 'duration', 'intensity')

        # get onset info for specific subject/task/run combo
        for cond in conditions:
            task_id = cond['task']
            try:
                evdata = np.atleast_1d(
                    self._load_model_task_run_onsets(subj, model, task_id, run,
                                                     cond['id']))
            except IOError:
                warning("onset definition file not found; no information "
                        "about condition '%s' for run %i" %
                        (cond['name'], run))
                continue
            for i, ev in enumerate(evdata):
                evdict = dict(
                    list(zip(ev_fields, [ev[field] for field in ev_fields])))
                evdict['task'] = task_id
                evdict['condition'] = cond['name']
                evdict['run'] = run
                evdict['conset_idx'] = i
                events.append(evdict)
        events = sorted(events, key=lambda x: x['onset'])
        for i, ev in enumerate(events):
            ev['onset_idx'] = i
        return events
Exemple #50
0
 def _check_range(self, c):
     """ check and adjust the voxel coordinates"""
     # check range
     if __debug__:
         debug('ATL__', "Querying for voxel %r" % (c,))
     if not check_range(c, self.extent):
         warning("Coordinates %r are not within the extent %r." \
                 " Reseting to (0,0,0)" % (c, self.extent))
         # assume that voxel [0,0,0] is blank, i.e. carries
         # no labels which could possibly result in evil outcome
         c = [0]*3
     return c
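
check_range itself is not shown in this snippet; a minimal sketch of what such a helper could look like, under the assumption that `extent` holds the exclusive upper bound per axis (an assumption, not the actual PyMVPA implementation).

def check_range(c, extent):
    """True if voxel coordinates `c` lie within [0, extent) on every axis.

    Assumed semantics -- the real helper may differ.
    """
    return all(0 <= ci < ei for ci, ei in zip(c, extent))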
Exemple #51
0
 def _check_range(self, c):
     """ check and adjust the voxel coordinates"""
     # check range
     if __debug__:
         debug('ATL__', "Querying for voxel %r" % (c, ))
     if not check_range(c, self.extent):
         warning("Coordinates %r are not within the extent %r." \
                 " Reseting to (0,0,0)" % (c, self.extent))
         # assume that voxel [0,0,0] is blank, i.e. carries
         # no labels which could possibly result in evil outcome
         c = [0] * 3
     return c
Exemple #52
0
def run(args):
    from mvpa2.base.hdf5 import h5save
    ds = None
    if args.txt_data is not None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        samples = _load_from_txt(args.txt_data)
        ds = Dataset(samples)
    elif args.npy_data is not None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        samples = _load_from_npy(args.npy_data)
        ds = Dataset(samples)
    elif args.mri_data is not None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = dict()
        if args.add_vol_attr is not None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(args.add_vol_attr)
            if len(args.add_vol_attr) != len(vol_attr):
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    if ds is None:
        if args.data is None:
            raise RuntimeError('no data source specified')
        else:
            ds = hdf2ds(args.data)[0]
    else:
        if args.data is not None:
            verbose(1, 'ignoring dataset input in favor of other data source -- remove either one to disambiguate')

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if args.add_fsl_mcpar is not None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            verbose(2, "Add motion regressor as sample attribute '%s'"
                       % ('mc_' + param))
            ds.sa['mc_' + param] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))
    # and store
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename += '.hdf5'
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
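
A hedged sketch of driving run() programmatically; the attribute names mirror those read in the function body, while the file names are placeholders and the actual command-line flag spellings are not shown here.

from argparse import Namespace

args = Namespace(
    txt_data=None, npy_data=None,
    mri_data=['bold.nii.gz'], mask='brainmask.nii.gz',   # placeholder paths
    add_vol_attr=None, add_fsl_mcpar=None,
    data=None, output='dataset', hdf5_compression='gzip',
)
# run(args)  # note: process_common_dsattr_opts() may expect further attributes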
Exemple #53
0
    def _forward_dataset_grouped(self, ds):
        mdata = []  # list of samples array pieces
        if self.__axis == 'samples':
            col = ds.sa
            axis = 0
        elif self.__axis == 'features':
            col = ds.fa
            axis = 1
        else:
            raise RuntimeError("This should not have happened!")

        attrs = dict(zip(col.keys(), [[] for i in col]))

        # create a dictionary for all unique elements in all attribute this
        # mapper should operate on
        self.__attrcombs = dict(
            zip(self.__uattrs, [col[attr].unique for attr in self.__uattrs]))
        # let it generate all combinations of unique elements in any attr
        for comb in _orthogonal_permutations(self.__attrcombs):
            selector = reduce(np.multiply, [
                array_whereequal(col[attr].value, value)
                for attr, value in comb.items()
            ])
            # process the samples
            if axis == 0:
                samples = ds.samples[selector]
            else:
                samples = ds.samples[:, selector]

            # check if there were any samples for such a combination,
            # if not -- warning and skip the rest of the loop body
            if not len(samples):
                warning(
                    'There were no samples for combination %s. It might be '
                    'a sign of an imbalanced dataset %s.' % (comb, ds))
                continue

            fxed_samples = self.__smart_apply_along_axis(samples)
            mdata.append(fxed_samples)
            if self.__attrfx is not None:
                # and now all samples attributes
                fxed_attrs = [
                    self.__attrfx(col[attr].value[selector]) for attr in col
                ]
                for i, attr in enumerate(col):
                    attrs[attr].append(fxed_attrs[i])

        if axis == 0:
            mdata = np.vstack(mdata)
        else:
            mdata = np.vstack(np.transpose(mdata))
        return mdata, attrs
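
The same group-then-reduce idea in plain NumPy: average samples per unique value of a single attribute, which the mapper above generalises to arbitrary attribute combinations and reduction functions.

import numpy as np

def mean_per_group(samples, groups):
    """Average rows of `samples` for each unique value in `groups`."""
    groups = np.asarray(groups)
    uniques = np.unique(groups)
    means = np.vstack([samples[groups == u].mean(axis=0) for u in uniques])
    return uniques, means

X = np.arange(12.0).reshape(6, 2)
keys, means = mean_per_group(X, ['a', 'a', 'b', 'b', 'b', 'a'])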
Exemple #54
0
    def predict(self, dataset):
        """Predict classifier on data

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so. Also, subclasses that need to invoke the superclass's
        prediction from within their own _predict should call _predict
        rather than predict(), since the latter would recurse
        """
        ## ??? yoh: changed to asany from as without exhaustive check
        data = np.asanyarray(dataset.samples)
        if __debug__:
            # Verify that we have no NaN/Inf's which we do not "support" ATM
            if not np.all(np.isfinite(data)):
                raise ValueError(
                    "Some input data for predict is not finite (NaN or Inf)")
            debug("CLF", "Predicting classifier %s on ds %s", (self, dataset))

        # remember the time when started computing predictions
        t0 = time.time()

        ca = self.ca
        # to assure that those are reset (could be set due to testing
        # post-training)
        ca.reset(['estimates', 'predictions'])

        self._prepredict(dataset)

        if self.__trainednfeatures > 0 \
               or 'notrain2predict' in self.__tags__:
            result = self._predict(dataset)
        else:
            warning(
                "Trying to predict using classifier trained on no features")
            if __debug__:
                debug("CLF",
                      "No features were present for training, prediction is " \
                      "bogus")
            result = [None] * data.shape[0]

        ca.predicting_time = time.time() - t0

        # with labels mapping in-place, we also need to go back to the
        # literal labels
        if self._attrmap:
            try:
                result = self._attrmap.to_literal(result)
            except KeyError as e:
                raise FailedToPredictError(
                    "Failed to convert predictions from numeric into "
                    "literals: %s" % e)

        self._postpredict(dataset, result)
        return result
Exemple #55
0
def __assign_nibabel_version():
    try:
        import nibabel
    except Exception as e:
        # FloatingError is defined in the same module which precludes
        # its specific except
        e_str = str(e)
        if "We had not expected long double type <type 'numpy.float128'>" in e_str:
            warning("Must be running under valgrind?  Available nibabel experiences "
                    "difficulty with float128 upon import and fails to work, thus is "
                    "report as N/A")
            raise ImportError("Fail to import nibabel due to %s" % e_str)
        raise
Exemple #56
0
def _warn_if_fmri_dataset(ds):
    assert (isinstance(ds, AttrDataset))

    fmri_fields = set(('imgaffine', 'imgtype', 'imghdr'))

    ds_fmri_fields = set.intersection(set(ds.a.keys()), fmri_fields)

    if len(ds_fmri_fields) > 0:
        warning('dataset attribute .a has fields %s, which suggests it is a '
                'volumetric dataset. Converting this dataset to GIFTI '
                'format will most likely result in unvisualisable '
                '(and potentially un-analysable) data. Consider using '
                'map2nifti instead' % (', '.join(ds_fmri_fields)))