Example #1
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """
    def __init__(self, **kwargs):
        Splitter.__init__(self, **(kwargs))

    __doc__ = enhanced_doc_string('HalfSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """
        Returns
        -------
        list of tuples (None, list of int)
          2 items: first half of samples into 1st split
        """
        return [(None, uniqueattrs[:len(uniqueattrs) / 2]),
                (None, uniqueattrs[len(uniqueattrs) / 2:])]

    def __str__(self):
        """String summary over the object
        """
        return \
          "HalfSplitter / " + Splitter.__str__(self)
Example #2
class PyMVPAAtlas(XMLBasedAtlas):
    """Base class for PyMVPA atlases, such as LabelsAtlas and ReferenceAtlas
    """

    source = 'PyMVPA'

    def __init__(self, *args, **kwargs):
        XMLBasedAtlas.__init__(self, *args, **kwargs)

        # sanity checks
        header = self.header
        headerChildrenTags = XMLBasedAtlas._children_tags(header)
        if not ('space' in headerChildrenTags) or \
           not ('space-flavor' in headerChildrenTags):
            raise XMLAtlasException("PyMVPA Atlas requires specification of" +
                                    " the space in which atlas resides")

        self.__space = header.space.text
        self.__spaceFlavor = header['space-flavor'].text

    __doc__ = enhanced_doc_string('PyMVPAAtlas', locals(), XMLBasedAtlas)

    ##REF: Name was automagically refactored
    def _load_images(self):
        # shortcut
        imagefile = self.header.images.imagefile
        #self.nlevels = len(self._levels_by_id)

        # Set offset if defined in XML file
        # XXX: should just take one from the qoffset... now that one is
        #       defined... this origin might be misleading actually
        self._origin = np.array((0, 0, 0))
        if imagefile.attrib.has_key('offset'):
            self._origin = np.array(
                [int(x) for x in imagefile.get('offset').split(',')])

        # Load the image file which has labels
        if self._force_image_file is not None:
            imagefilename = self._force_image_file
        else:
            imagefilename = imagefile.text
        imagefilename = reuse_absolute_path(self._filename, imagefilename)

        try:
            self._image = NiftiImage(imagefilename)
        except RuntimeError, e:
            raise RuntimeError, \
                  " Cannot open file %s due to %s" % (imagefilename, e)

        self._data = self._image.data

        # remove bogus dimensions on top of 4th
        if len(self._data.shape[0:-4]) > 0:
            bogus_dims = self._data.shape[0:-4]
            if max(bogus_dims) > 1:
                raise RuntimeError, "Atlas %s has more than 4 of non-singular" \
                      "dimensions" % imagefilename
            new_shape = self._data.shape[-4:]
            self._data = self._data.reshape(new_shape)
Example #3
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provide folding splitting. Given a dataset with N chunks, with
    cvtype=1 (which is default), it would generate N splits, where
    each chunk sequentially is taken out (with replacement) for
    cross-validation.  Example, if there is 4 chunks, splits for
    cvtype=1 are::

        [[1, 2, 3], [0]]
        [[0, 2, 3], [1]]
        [[0, 1, 3], [2]]
        [[0, 1, 2], [3]]

    If cvtype>1, then all possible combinations of cvtype number of
    chunks are taken out for testing, so for cvtype=2 in previous
    example::

        [[2, 3], [0, 1]]
        [[1, 3], [0, 2]]
        [[1, 2], [0, 3]]
        [[0, 3], [1, 2]]
        [[0, 2], [1, 3]]
        [[0, 1], [2, 3]]

    """
    def __init__(self, cvtype=1, **kwargs):
        """Initialize the N-fold splitter.

        Parameters
        ----------
        cvtype : int
          Type of cross-validation: N-(cvtype)
        **kwargs
          Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        # pylint happiness block
        self.__cvtype = cvtype

    __doc__ = enhanced_doc_string('NFoldSplitter', locals(), Splitter)

    def __str__(self):
        """String summary over the object
        """
        return \
          "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)

    def _get_split_config(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        return [(None, i) for i in \
                 support.xunique_combinations(uniqueattrs, self.__cvtype)]
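
For the four-chunk example from the docstring, the configurations can be reproduced with a few lines of plain Python. This is a sketch, not PyMVPA code: `itertools.combinations` is used here only as a stand-in for `support.xunique_combinations`.

from itertools import combinations

uniqueattrs = [0, 1, 2, 3]     # hypothetical unique chunk values
cvtype = 2                     # leave-two-chunks-out

cfg = [(None, list(combo)) for combo in combinations(uniqueattrs, cvtype)]
for spec in cfg:
    print(spec)
# (None, [0, 1]), (None, [0, 2]), ... -- six configurations; the listed chunks
# form the testing part and `None` stands for the remaining (training) chunks.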
Example #4
class NGroupSplitter(Splitter):
    """Split a dataset into N-groups of the sample attribute.

    For example, NGroupSplitter(2) is the same as the HalfSplitter and
    yields two splits: first (1st half, 2nd half) and second (2nd half,
    1st half).
    """
    def __init__(self, ngroups=4, **kwargs):
        """Initialize the N-group splitter.

        Parameters
        ----------
        ngroups : int
          Number of groups to split the attribute into.
        **kwargs
          Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        self.__ngroups = ngroups

    __doc__ = enhanced_doc_string('NGroupSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """
        Returns
        -------
        list of tuples (None, list of int)
          Indices for splitting
        """

        # make sure there are more of attributes than desired groups
        if len(uniqueattrs) < self.__ngroups:
            raise ValueError, "Number of groups (%d) " % (self.__ngroups) + \
                  "must be less than " + \
                  "or equal to the number of unique attributes (%d)" % \
                  (len(uniqueattrs))

        # use coarsen_chunks to get the split indices
        split_ind = coarsen_chunks(uniqueattrs, nchunks=self.__ngroups)
        split_ind = np.asarray(split_ind)

        # loop and create splits
        split_list = [(None, uniqueattrs[split_ind == i])
                      for i in range(self.__ngroups)]
        return split_list

    def __str__(self):
        """String summary over the object
        """
        return \
          "N-%d-GroupSplitter / " % self.__ngroup + Splitter.__str__(self)
Example #5
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule (`splitrule`)
    to its constructor. Such a rule is basically a sequence of split
    definitions. Every single element in this sequence results in exactly one
    split generated by the Splitter. Each element is in turn a sequence of
    sequences of sample ids, one for each dataset that shall be generated in
    the split.

    Examples
    --------
    Generate two splits. In the first split the *second* dataset
    contains all samples with sample attributes corresponding to
    either 0, 1 or 2. The *first* dataset of the first split contains
    all samples which are not split into the second dataset.

    The second split yields three datasets. The first contains all samples
    corresponding to sample attributes 1 and 2, the second dataset
    contains only samples with attribute 3 and the last dataset
    contains the samples with attributes 5 and 6.

    >>> splitter = CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """
        Parameters
        ----------
        splitrule : list of tuple
          Custom splits to use
        """
        Splitter.__init__(self, **(kwargs))

        self.__splitrule = splitrule

    __doc__ = enhanced_doc_string('CustomSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """
        Returns
        -------
        whatever was provided in splitrule argument
        """
        return self.__splitrule

    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
Example #6
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """
        Parameters
        ----------
        usevalues : bool
          If True the values of the attribute used for splitting will be
          used to determine odd and even samples. If False odd and even
          chunks are defined by the order of attribute values, i.e. the first
          unique attribute is odd, the second is even, even though the
          corresponding values might indicate the opposite (e.g. in the case
          of [2, 3]).
        """
        Splitter.__init__(self, **(kwargs))

        self.__usevalues = usevalues

    __doc__ = enhanced_doc_string('OddEvenSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """
        Returns
        -------
        list of tuples (None, list of int)
          2 items: odd samples into 1st split
        """
        if self.__usevalues:
            return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
                    (None, uniqueattrs[(uniqueattrs % 2) == False])]
        else:
            return [
                (None, uniqueattrs[np.arange(len(uniqueattrs)) % 2 == True]),
                (None, uniqueattrs[np.arange(len(uniqueattrs)) % 2 == False])
            ]

    def __str__(self):
        """String summary over the object
        """
        return \
          "OddEvenSplitter / " + Splitter.__str__(self)
Example #7
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple (or
    as the first element if mode='first' is given). The other element of that
    tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """
        Parameters
        ----------
        mode
          Either 'first' or 'second' (default) -- which output dataset
          would actually contain the samples
        """
        Splitter.__init__(self, **(kwargs))

        if not mode in NoneSplitter._known_modes:
            raise ValueError, "Unknown mode %s for NoneSplitter" % mode
        self.__mode = mode

    __doc__ = enhanced_doc_string('NoneSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]

    def __str__(self):
        """String summary over the object
        """
        return \
          "NoneSplitter / " + Splitter.__str__(self)
Example #8
class LabelsAtlas(PyMVPAAtlas):
    """
    Atlas which provides labels for the given coordinate
    """

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):
        """
        Return labels for the given voxel at the levels specified by index
        """
        levels = self._get_selected_levels(levels=levels)

        result = {'voxel_queried': c}

        # check range
        c = self._check_range(c)

        resultLevels = []
        for level in levels:
            if self._levels.has_key(level):
                level_ = self._levels[level]
            else:
                raise IndexError("Unknown index or description for level %d" %
                                 level)

            resultIndex = int(self._data[level_.index, c[2], c[1], c[0]])

            resultLevels += [{
                'index': level_.index,
                'id': level_.description,
                'label': level_[resultIndex]
            }]

        result['labels'] = resultLevels
        return result

    __doc__ = enhanced_doc_string('LabelsAtlas', locals(), PyMVPAAtlas)
Example #9
class ProjectionMapper(Mapper):
    """Linear mapping between multidimensional spaces.

    This class cannot be used directly. Sub-classes have to implement
    the `_train()` method, which has to compute the projection matrix
    `_proj` and optionally offset vectors `_offset_in` and
    `_offset_out` (if initialized with demean=True, which is default)
    given a dataset (see `_train()` docstring for more information).

    Once the projection matrix is available, this class provides
    functionality to perform forward and backwards linear mapping of
    data, the latter by default using pseudo-inverse (but could be
    altered in subclasses, like hermitian (conjugate) transpose in
    case of SVD).  Additionally, `ProjectionMapper` supports optional
    selection of arbitrary components (i.e. columns of the projection
    matrix).

    Forward and back-projection matrices (a.k.a. *projection* and
    *reconstruction*) are available via the `proj` and `recon`
    properties.
    """

    _DEV__doc__ = """Think about renaming `demean`, may be `translation`?"""

    def __init__(self, selector=None, demean=True):
        """Initialize the ProjectionMapper

        Parameters
        ----------
        selector : None or list
          Which components (i.e. columns of the projection matrix)
          should be used for mapping. If `selector` is `None` all
          components are used. If a list is provided, all list
          elements are treated as component ids and the respective
          components are selected (all others are discarded).
        demean : bool
          Whether data should be demeaned while computing
          projections and the mean added back while doing reverse()
        """
        Mapper.__init__(self)

        # by default we want to wipe the feature attributes out during mapping
        self._fa_filter = []

        self._selector = selector
        self._proj = None
        """Forward projection matrix."""
        self._recon = None
        """Reverse projection (reconstruction) matrix."""
        self._demean = demean
        """Flag whether to demean the to be projected data, prior to projection.
        """
        self._offset_in = None
        """Offset (most often just mean) in the input space"""
        self._offset_out = None
        """Offset (most often just mean) in the output space"""

    __doc__ = enhanced_doc_string('ProjectionMapper', locals(), Mapper)

    @accepts_dataset_as_samples
    def _pretrain(self, samples):
        """Determine the projection matrix.

        Parameters
        ----------
        samples : ndarray
          Training samples to operate on
        """
        if self._demean:
            self._offset_in = samples.mean(axis=0)

    def _posttrain(self, dataset):
        # perform component selection
        if self._selector is not None:
            self.select_out(self._selector)

    ##REF: Name was automagically refactored
    def _demean_data(self, data):
        """Helper which optionally demeans
        """
        if self._demean:
            # demean the training data
            data = data - self._offset_in

            if __debug__ and "MAP_" in debug.active:
                debug(
                    "MAP_",
                    "%s: Mean of data in input space %s was subtracted" %
                    (self.__class__.__name__, self._offset_in))
        return data

    def _forward_data(self, data):
        if self._proj is None:
            raise RuntimeError, "Mapper needs to be train before used."

        # local binding
        demean = self._demean

        d = np.asmatrix(data)

        # Remove input offset if present
        if demean and self._offset_in is not None:
            d = d - self._offset_in

        # Do forward projection
        res = (d * self._proj).A

        # Add output offset if present
        if demean and self._offset_out is not None:
            res += self._offset_out

        return res

    def _reverse_data(self, data):
        if self._proj is None:
            raise RuntimeError, "Mapper needs to be trained before used."
        d = np.asmatrix(data)
        # Remove offset if present in output space
        if self._demean and self._offset_out is not None:
            d = d - self._offset_out

        # Do reverse projection
        res = (d * self.recon).A

        # Add offset in input space
        if self._demean and self._offset_in is not None:
            res += self._offset_in

        return res

    ##REF: Name was automagically refactored
    def _compute_recon(self):
        """Given that a projection is present -- compute reconstruction matrix.
        By default -- pseudoinverse of projection matrix.  Might be overridden
        in derived classes for efficiency.
        """
        return np.linalg.pinv(self._proj)

    ##REF: Name was automagically refactored
    def _get_recon(self):
        """Compute (if necessary) and return reconstruction matrix
        """
        # (re)build reconstruction matrix
        recon = self._recon
        if recon is None:
            self._recon = recon = self._compute_recon()
        return recon

    proj = property(fget=lambda self: self._proj, doc="Projection matrix")
    recon = property(fget=_get_recon, doc="Backprojection matrix")
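
The forward/reverse scheme the class implements can be sketched in a few lines of plain NumPy. This is not PyMVPA code: the data and the projection matrix are random placeholders, whereas a real subclass would compute `_proj` in `_train()`.

import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(20, 5)                    # 20 samples, 5 features (made up)
proj = rng.rand(5, 3)                     # hypothetical 5 -> 3 projection matrix

offset_in = data.mean(axis=0)             # what _pretrain() stores when demean=True
forwarded = np.dot(data - offset_in, proj)              # forward mapping

recon = np.linalg.pinv(proj)              # default _compute_recon(): pseudo-inverse
reconstructed = np.dot(forwarded, recon) + offset_in    # reverse mapping

# with only 3 of 5 components the reconstruction is approximate
print(np.abs(reconstructed - data).max())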
Example #10
class FSLAtlas(XMLBasedAtlas):
    """Base class for FSL atlases

    """
    source = 'FSL'

    def __init__(self, *args, **kwargs):
        """
        """
        XMLBasedAtlas.__init__(self, *args, **kwargs)
        self.space = 'MNI'

    __doc__ = enhanced_doc_string('FSLAtlas', locals(), XMLBasedAtlas)

    ##REF: Name was automagically refactored
    def _load_images(self):
        resolution = self._resolution
        header = self.header
        images = header.images
        # Load present images
        # XXX might be refactored to avoid duplication of
        #     effort with PyMVPAAtlas
        ni_image = None
        resolutions = []
        if self._force_image_file is None:
            imagefile_candidates = [
                reuse_absolute_path(self._filename,
                                    i.imagefile.text,
                                    force=True) for i in images
            ]
        else:
            imagefile_candidates = [self._force_image_file]

        for imagefilename in imagefile_candidates:
            try:
                ni_image_ = NiftiImage(imagefilename, load=False)
            except RuntimeError, e:
                raise RuntimeError, " Cannot open file " + imagefilename

            resolution_ = ni_image_.pixdim[0]
            if resolution is None:
                # select this one if it is the best (smallest pixdim) so far
                if ni_image is None or \
                       resolution_ < ni_image.pixdim[0]:
                    ni_image = ni_image_
                    self._image_file = imagefilename
            else:
                if resolution_ == resolution:
                    ni_image = ni_image_
                    self._image_file = imagefilename
                    break
                else:
                    resolutions += [resolution_]
            # TODO: also make use of summaryimagefile may be?

        if ni_image is None:
            msg = "Could not find an appropriate atlas among %d atlases." \
                  % len(imagefile_candidates)
            if resolution is not None:
                msg += " Atlases had resolutions %s" % \
                      (resolutions,)
            raise RuntimeError, msg
        if __debug__:
            debug('ATL__', "Loading atlas data from %s" % self._image_file)
        self._image = ni_image
        self._resolution = ni_image.pixdim[0]
        self._origin = np.abs(ni_image.header['qoffset']) * 1.0  # XXX
        self._data = self._image.data
Example #11
class FSLProbabilisticAtlas(FSLAtlas):
    """Probabilistic FSL atlases
    """
    def __init__(self, thr=0.0, strategy='all', sort=True, *args, **kwargs):
        """

        Parameters
        ----------
        thr : float
          Value to threshold at
        strategy : str
          Possible values
            all - all entries above thr
            max - entry with maximal value
        sort : bool
          Whether to sort entries for the 'all' strategy according to
          probability
        """

        FSLAtlas.__init__(self, *args, **kwargs)
        self.thr = thr
        self.strategy = strategy
        self.sort = sort

    __doc__ = enhanced_doc_string('FSLProbabilisticAtlas', locals(), FSLAtlas)

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):
        """Return labels for the voxel

        Parameters
        ----------
        c : tuple of coordinates (xyz)
        levels : just for API consistency (heh heh). Must be 0 for FSL atlases
        """

        if levels is not None and not (levels in [0, [0], (0, )]):
            raise ValueError, \
                  "I guess we don't support levels other than 0 in FSL atlas." \
                  " Got levels=%s" % (levels,)
        # check range
        c = self._check_range(c)

        # XXX think -- may be we should better assign each map to a
        # different level
        level = 0
        resultLabels = []
        for index, area in enumerate(self._levels[level]):
            prob = int(self._data[index, c[2], c[1], c[0]])
            if prob > self.thr:
                resultLabels += [
                    dict(
                        index=index,
                        #id=
                        label=area.text,
                        prob=prob)
                ]

        if self.sort or self.strategy == 'max':
            resultLabels.sort(cmp=lambda x, y: cmp(x['prob'], y['prob']),
                              reverse=True)

        if self.strategy == 'max':
            resultLabels = resultLabels[:1]
        elif self.strategy == 'all':
            pass
        else:
            raise ValueError, 'Unknown strategy %s' % self.strategy

        result = {
            'voxel_queried': c,
            # in the list since we have only single level but
            # with multiple entries
            'labels': [resultLabels]
        }

        return result

    def find(self, *args, **kwargs):
        """Just a shortcut to the only level.

        See :class:`~mvpa.atlases.base.Level.find` for more info
        """
        return self.levels[0].find(*args, **kwargs)

    def get_map(self, target, strategy='unique', axes_order='xyz'):
        """Return a probability map as an array

        Parameters
        ----------
        target : int or str or re._pattern_type
          If int, map for given index is returned. Otherwise, .find is called
          with ``unique=True`` to find matching area
        strategy : str in ('unique', 'max')
          If 'unique', then if multiple areas match, exception would be raised.
          In case of 'max', each voxel would get maximal value of probabilities
          from all matching areas
        axes_order : str in ('xyz', 'zyx')
          In what order axes of the returned array should follow.
        """
        if isinstance(target, int):
            res = self._data[target]
            if axes_order == 'xyz':
                # ATM we store/access in zyx (kji) order, so we would need
                # to swap
                return res.T
            elif axes_order == 'zyx':
                return res
            else:
                raise ValueError, \
                      "Unknown axes_order=%r provided" % (axes_order,)
        else:
            lev = self.levels[0]  # we have just 1 here
            if strategy == 'unique':
                return self.get_map(lev.find(target, unique=True).index,
                                    axes_order=axes_order)
            else:
                maps_dict = self.get_maps(target, axes_order=axes_order)
                maps = np.array(maps_dict.values())
                return np.max(maps, axis=0)

    def get_maps(self, target, axes_order='xyz', key_attr=None, overlaps=None):
        """Return a dictionary of probability maps for the target

        Each key is a `Label` instance, and value is the probability map

        Parameters
        ----------
        target : str or re._pattern_type
          .find is called with a target and unique=False to find all matches
        axes_order : str in ('xyz', 'zyx')
          In what order axes of the returned array should follow.
        key_attr : None or str
          What to use for the keys of the dictionary.  If None,
          `Label` instance would be used as a key.  If some attribute
          provided (e.g. 'text', 'abbr', 'index'), corresponding
          attribute of the `Label` instance would be taken as a key.
        overlaps : None or {'max'}
          How to treat overlaps in maps.  If None, nothing is done and maps
          might have overlaps.  If 'max', then maps would not overlap and
          competing maps will be resolved based on maximal value (e.g. if
          maps contain probabilities).
        """
        lev = self.levels[0]  # we have just 1 here
        if key_attr is None:
            key_gen = lambda x: x
        else:
            key_gen = lambda x: getattr(x, key_attr)

        res = [[key_gen(l),
                self.get_map(l.index, axes_order=axes_order)]
               for l in lev.find(target, unique=False)]

        if overlaps == 'max':
            # not efficient since it places all maps back into a single
            # ndarray... but well
            maps = np.array([x[1] for x in res])
            maximums = np.argmax(maps, axis=0)
            overlaps = np.sum(maps != 0, axis=0) > 1
            # now lets go and infiltrate maps:
            # and do silly loop since we will reassign
            # the entries possibly
            for i in xrange(len(res)):
                n, m = res[i]
                loosers = np.logical_and(overlaps, ~(maximums == i))
                if len(loosers):
                    # copy and modify
                    m_new = m.copy()
                    m_new[loosers] = 0
                    res[i][1] = m_new
        elif overlaps is None:
            pass
        else:
            raise ValueError, \
                  "Incorrect value of overlaps argument %s" % overlaps
        return dict(res)
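
The `overlaps='max'` resolution in `get_maps` boils down to zeroing, per voxel, every map that does not hold the maximal value. A stand-alone NumPy sketch (not PyMVPA; two made-up 3-voxel maps):

import numpy as np

maps = np.array([[10, 40, 0],            # hypothetical probability map A
                 [30, 20, 5]])           # hypothetical probability map B

maximums = np.argmax(maps, axis=0)               # which map wins at each voxel
overlaps = np.sum(maps != 0, axis=0) > 1         # voxels claimed by more than one map

resolved = maps.copy()
for i in range(len(maps)):
    losers = np.logical_and(overlaps, maximums != i)
    resolved[i, losers] = 0

print(resolved)    # [[ 0 40  0]
                   #  [30  0  5]]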
Example #12
class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """
    def __init__(self,
                 source,
                 header=True,
                 sep=None,
                 headersep=None,
                 dtype=float,
                 skiplines=0):
        """Read data from file into a dictionary.

        Parameters
        ----------
        source : str or dict
          If source is given as a string, all data is read from the
          file and additional keyword arguments can be used to
          customize the read procedure. If a dictionary is passed,
          a deepcopy is performed.
        header : bool or list of str
          Indicates whether the column names should be read from the
          first line (`header=True`). If `header=False` unique
          column names will be generated (see class docs). If
          `header` is a python list, its content is used as column
          header names and its length has to match the number of
          columns in the file.
        sep : str or None
          Separator string. The actual meaning depends on the output
          format (see class docs).
        headersep : str or None
          Separator string used in the header. The actual meaning
          depends on the output format (see class docs).
        dtype : type or list(types)
          Desired datatype(s). The datatype per column can be specified by
          passing a list of types.
        skiplines : int
          Number of lines to skip at the beginning of the file.
        """
        # init base class
        dict.__init__(self)

        # initialize with default
        self._header_order = None

        if isinstance(source, str):
            self._from_file(source,
                            header=header,
                            sep=sep,
                            headersep=headersep,
                            dtype=dtype,
                            skiplines=skiplines)

        elif isinstance(source, dict):
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unknown source for ColumnData [%r]' \
                              % type(source)

        # generate missing properties for each item in the header
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._get_attrib('%s')" % (k)
                # Sanitize the key: substitute ' ', '[' and ']' with '_'
                k_ = sub('[[\] ]', '_', k)
                # replace multiple consecutive _s
                k_ = sub('__+', '_', k_)
                # remove quotes
                k_ = sub('["\']', '', k_)
                if __debug__:
                    debug(
                        "IOH",
                        "Registering property %s for ColumnData key %s" %
                        (k_, k))
                # make sure to import class directly into local namespace
                # otherwise following does not work for classes defined
                # elsewhere
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                exec "%s.%s = property(fget=%s)"  % \
                     (self.__class__.__name__, k_, getter)
                # TODO!!! Check if it is safe actually here to rely on value of
                #         k in lambda. May be it is treated as continuation and
                #         some local space would override it????
                #setattr(self.__class__,
                #        k,
                #        property(fget=lambda x: x._get_attrib("%s" % k)))
                # it seems to be error-prone due to continuation...

    __doc__ = enhanced_doc_string('ColumnData', locals())

    ##REF: Name was automagically refactored
    def _get_attrib(self, key):
        """Return corresponding value if given key is known to current instance

        Is used for automatically added properties to the class.

        Raises
        ------
        ValueError:
          If `key` is not known to given instance

        Returns
        -------
        Value if `key` is known
        """
        if self.has_key(key):
            return self[key]
        else:
            raise ValueError, "Instance %r has no data about %r" \
                % (self, key)

    def __str__(self):
        s = self.__class__.__name__
        if len(self.keys()) > 0:
            s += " %d rows, %d columns [" % \
                 (self.nrows, self.ncolumns)
            s += reduce(lambda x, y: x + " %s" % y, self.keys())
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.
        """
        length = None
        for k in self.keys():
            if length == None:
                length = len(self[k])
            else:
                if not len(self[k]) == length:
                    raise ValueError, "Data integrity lost. Columns do not " \
                                      "have equal length."

    def _from_file(self, filename, header, sep, headersep, dtype, skiplines):
        """Loads column data from file -- clears object first.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None

        [file_.readline() for x in range(skiplines)]
        """Simply skip some lines"""
        # make column names, either take header or generate
        if header == True:
            # read first line and split by 'sep'
            hdr = file_.readline().split(headersep)
            # remove bogus empty header titles
            hdr = [x for x in hdr if len(x.strip())]
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            hdr = [str(i) for i in xrange(len(file_.readline().split(sep)))]
            # reset file to not miss the first line
            file_.seek(0)
            [file_.readline() for x in range(skiplines)]

        # string in lists: one per column
        tbl = [[] for i in xrange(len(hdr))]

        # do per column dtypes
        if not isinstance(dtype, list):
            dtype = [dtype] * len(hdr)

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                if not dtype[i] is None:
                    try:
                        v = dtype[i](v)
                    except ValueError:
                        warning("Can't convert %r to desired datatype %r." %
                                (v, dtype) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]

    def __iadd__(self, other):
        """Merge column data.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%r].' % (k, )
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%r].' \
                                  % type(v)
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self

    ##REF: Name was automagically refactored
    def select_samples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data

    @property
    def ncolumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())

    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        Parameters
        ----------
        filename : str
          Target filename
        header : bool, optional
          If `True` a column header is written, using the column
          keys. If `False` no header is written.
        header_order : None or list of str
          If it is a list of strings, they will be used instead
          of simply asking for the dictionary keys. However
          these strings must match the dictionary keys in number
          and identity. This argument type can be used to
          determine the order of the columns in the output file.
          The default value is `None`. In this case the columns
          will be in an arbitrary order.
        sep : str, optional
          String that is written as a separator between two data columns.
        """
        # XXX do the try: except: dance
        file_ = open(filename, 'w')

        # write header
        if header_order == None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use stored order + newly added keys at the last columns
                col_hdr = self._header_order + \
                          list(Set(self.keys()).difference(
                                                Set(self._header_order)))
        else:
            if not len(header_order) == self.ncolumns:
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%r]' % (k, )
            col_hdr = header_order

        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.nrows):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()

    @property
    def nrows(self):
        """Returns the number of rows.
        """
        # no data no rows (after Bob Marley)
        if not len(self.keys()):
            return 0
        # otherwise first key is as good as any other
        else:
            return len(self[self.keys()[0]])
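
The storage layout ColumnData builds (one list per column, keyed by the header names) can be sketched independently of the class. This simplified stand-in is not the class above; the two-column text is made up and the `dtype=float` conversion is hard-coded:

text = """onset duration
0.0 2.0
4.5 2.0
"""

lines = text.splitlines()
hdr = lines[0].split()                          # header=True: first line gives the keys
table = dict((name, []) for name in hdr)        # one list per column

for line in lines[1:]:
    line = line.strip()
    if not line or line.startswith('#'):        # skip blanks and comment lines
        continue
    for name, value in zip(hdr, line.split()):
        table[name].append(float(value))        # dtype conversion, float by default

print(table)    # {'onset': [0.0, 4.5], 'duration': [2.0, 2.0]}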
Example #13
class ReferencesAtlas(PyMVPAAtlas):
    """
    Atlas which provides references to the other atlases.

    Example: the atlas which has references to the closest points
    (closest Gray, etc) in another atlas.
    """
    def __init__(self, distance=0, *args, **kwargs):
        """Initialize `ReferencesAtlas`
        """
        PyMVPAAtlas.__init__(self, *args, **kwargs)
        # sanity checks
        if not ('reference-atlas' in XMLBasedAtlas._children_tags(
                self.header)):
            raise XMLAtlasException(
                "ReferencesAtlas must refer to a some other atlas")

        referenceAtlasName = self.header["reference-atlas"].text

        # uff -- another evil import but we better use the factory method
        from mvpa.atlases.warehouse import Atlas
        self.__referenceAtlas = Atlas(
            filename=reuse_absolute_path(self._filename, referenceAtlasName))

        if self.__referenceAtlas.space != self.space or \
           self.__referenceAtlas.space_flavor != self.space_flavor:
            raise XMLAtlasException(
                "Reference and original atlases should be in the same space")

        self.__referenceLevel = None
        self.set_distance(distance)

    __doc__ = enhanced_doc_string('ReferencesAtlas', locals(), PyMVPAAtlas)

    # number of levels must be of the referenced atlas due to
    # handling of that in __getitem__
    #nlevels = property(fget=lambda self:self.__referenceAtlas.nlevels)
    ##REF: Name was automagically refactored
    def _get_nlevels_virtual(self):
        return self.__referenceAtlas.nlevels

    ##REF: Name was automagically refactored
    def set_reference_level(self, level):
        """
        Set the level which will be queried
        """
        if self._levels.has_key(level):
            self.__referenceLevel = self._levels[level]
        else:
            raise IndexError, \
                  "Unknown reference level %r. " % level + \
                  "Known are %r" % (self._levels.keys(), )

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):

        if self.__referenceLevel is None:
            warning("You did not provide what level to use "
                    "for reference. Assigning 0th level -- '%s'" %
                    (self._levels[0], ))
            self.set_reference_level(0)
            # return self.__referenceAtlas.label_voxel(c, levels)

        c = self._check_range(c)

        # obtain coordinates of the closest voxel
        cref = self._data[self.__referenceLevel.indexes, c[2], c[1], c[0]]
        dist = norm((cref - c) * self.voxdim)
        if __debug__:
            debug(
                'ATL__', "Closest referenced point for %r is "
                "%r at distance %3.2f" % (c, cref, dist))
        if (self.distance - dist) >= 1e-3:  # neglect everything smaller
            result = self.__referenceAtlas.label_voxel(cref, levels)
            result['voxel_referenced'] = c
            result['distance'] = dist
        else:
            result = self.__referenceAtlas.label_voxel(c, levels)
            if __debug__:
                debug(
                    'ATL__', "Closest referenced point is "
                    "further than desired distance %.2f" % self.distance)
            result['voxel_referenced'] = None
            result['distance'] = 0
        return result

    ##REF: Name was automagically refactored
    def levels_listing(self):
        return self.__referenceAtlas.levels_listing()

    ##REF: Name was automagically refactored
    def _get_levels_virtual(self):
        return self.__referenceAtlas.levels

    ##REF: Name was automagically refactored
    def set_distance(self, distance):
        """Set desired maximal distance for the reference
        """
        if distance < 0:
            raise ValueError("Distance should not be negative. "
                             " Thus '%f' is not a legal value" % distance)
        if __debug__:
            debug('ATL__',
                  "Setting maximal distance for queries to be %d" % distance)
        self.__distance = distance

    distance = property(fget=lambda self: self.__distance, fset=set_distance)
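
The distance test in `label_voxel` compares the Euclidean distance (scaled by the voxel dimensions) between the queried voxel and the closest referenced voxel against the configured maximum. A toy NumPy sketch (not PyMVPA; all numbers are made up):

import numpy as np

voxdim = np.array([2.0, 2.0, 2.0])   # hypothetical voxel size in mm
c = np.array([10, 20, 30])           # queried voxel coordinates
cref = np.array([11, 20, 29])        # closest referenced voxel

dist = np.linalg.norm((cref - c) * voxdim)      # ~2.83 mm
max_distance = 5.0

use_reference = (max_distance - dist) >= 1e-3   # same tolerance as in the code above
print(dist, use_reference)                      # 2.828..., True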
Example #14
    def __init__(cls, name, bases, dict):
        """
        Parameters
        ----------
        name : str
          Name of the class
        bases : iterable
          Base classes
        dict : dict
          Attributes.
        """
        if __debug__:
            debug(
                "COLR",
                "AttributesCollector call for %s.%s, where bases=%s, dict=%s " \
                % (cls, name, bases, dict))

        super(AttributesCollector, cls).__init__(name, bases, dict)

        collections = {}
        for name, value in dict.iteritems():
            if isinstance(value, IndexedCollectable):
                baseclassname = value.__class__.__name__
                col = _known_collections[baseclassname][0]
                # XXX should we allow to throw exceptions here?
                if not collections.has_key(col):
                    collections[col] = {}
                collections[col][name] = value
                # and assign name if not yet was set
                if value.name is None:
                    value.name = name
                # !!! We do not keep copy of this attribute static in the class.
                #     Due to below traversal of base classes, we should be
                #     able to construct proper collections even in derived classes
                delattr(cls, name)

        # XXX can we first collect parent's ca and then populate with ours?
        # TODO

        for base in bases:
            if hasattr(base, "__metaclass__") and \
                   base.__metaclass__ == AttributesCollector:
                # TODO take care about overriding one from super class
                # for state in base.ca:
                #    if state[0] =
                newcollections = base._collections_template
                if len(newcollections) == 0:
                    continue
                if __debug__: # XXX RF:  and "COLR" in debug.active:
                    debug("COLR",
                          "Collect collections %s for %s from %s" %
                          (newcollections, cls, base))
                for col, collection in newcollections.iteritems():
                    if collections.has_key(col):
                        collections[col].update(collection)
                    else:
                        collections[col] = collection


        if __debug__:
            debug("COLR",
                  "Creating ConditionalAttributesCollection template %s with collections %s"
                  % (cls, collections.keys()))

        # if there is an explicit list of collections specified for the class
        if hasattr(cls, "_ATTRIBUTE_COLLECTIONS"):
            for col in cls._ATTRIBUTE_COLLECTIONS:
                if not col in _col2class:
                    raise ValueError, \
                          "Requested collection %s is unknown to collector" % \
                          col
                if not col in collections:
                    collections[col] = None

        # TODO: check on conflict in names of Collections' items!  since
        # otherwise even order is not definite since we use dict for
        # collections.
        # XXX should we switch to tuple?

        for col, colitems in collections.iteritems():
            # so far we collected the collection items in a dict, but the new
            # API requires to pass a _list_ of collectables instead of a dict.
            # So, whenever there are items, we pass just the values of the dict.
            # There is no information lost, since the keys of the dict are the
            # name attributes of each collectable in the list.
            if not colitems is None:
                collections[col] = _col2class[col](items=colitems.values())
            else:
                collections[col] = _col2class[col]()

        setattr(cls, "_collections_template", collections)

        #
        # Expand documentation for the class based on the listed
        # parameters and on whether it is stateful
        #
        # TODO -- figure nice way on how to alter __init__ doc directly...
        textwrapper = TextWrapper(subsequent_indent="    ",
                                  initial_indent="    ",
                                  width=70)

        # Parameters
        paramsdoc = []
        paramscols = []
        for col in ('params', 'kernel_params'):
            if collections.has_key(col):
                paramscols.append(col)
                # lets at least sort the parameters for consistent output
                col_items = collections[col]
                iparams = [(v._instance_index, k)
                           for k,v in col_items.iteritems()]
                iparams.sort()
                paramsdoc += [(col_items[iparam[1]].name,
                               col_items[iparam[1]]._paramdoc())
                              for iparam in iparams]

        # Parameters collection could be hashed to decide if
        # any were changed? XXX may be not needed at all?
        setattr(cls, "_paramscols", paramscols)

        # States doc
        cadoc = ""
        if collections.has_key('ca'):
            paramsdoc += [
                ('enable_ca',
                 "enable_ca : None or list of str\n  "
                 "Names of the conditional attributes which should "
                 "be enabled in addition\n  to the default ones"),
                ('disable_ca',
                 "disable_ca : None or list of str\n  "
                 "Names of the conditional attributes which should "
                 "be disabled""")]
            if len(collections['ca']):
                cadoc += '\n'.join(['* ' + x
                                    for x in collections['ca'].listing])
                cadoc += "\n\n(Conditional attributes enabled by default suffixed with `+`)"
            if __debug__:
                debug("COLR", "Assigning __cadoc to be %s" % cadoc)
            setattr(cls, "_cadoc", cadoc)

        if paramsdoc != "":
            if __debug__ and 'COLR' in debug.active:
                debug("COLR", "Assigning __paramsdoc to be %s" % paramsdoc)
            setattr(cls, "_paramsdoc", paramsdoc)

        if len(paramsdoc) or cadoc != "":
            cls.__doc__ = enhanced_doc_string(cls, *bases)
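
Stripped of the collection types, documentation handling and debugging, the core metaclass trick is small. The following heavily reduced sketch is not PyMVPA code (`Collectable`, `Collector` and `Demo` are made-up names); it keeps only the part that gathers marker attributes into a per-class template and removes them from the class body (Python 2 metaclass syntax, as in the code above):

class Collectable(object):
    """Marker type whose instances get collected by the metaclass."""
    def __init__(self, doc=""):
        self.doc = doc
        self.name = None                  # assigned by the metaclass

class Collector(type):
    def __init__(cls, name, bases, dct):
        super(Collector, cls).__init__(name, bases, dct)
        collected = {}
        # start from whatever the base classes already collected
        for base in bases:
            collected.update(getattr(base, '_collections_template', {}))
        # gather this class' own marker attributes and drop them from the class
        for attr, value in dct.items():
            if isinstance(value, Collectable):
                if value.name is None:
                    value.name = attr
                collected[attr] = value
                delattr(cls, attr)
        cls._collections_template = collected

class Demo(object):
    __metaclass__ = Collector
    score = Collectable(doc="a collected attribute")

print(sorted(Demo._collections_template.keys()))   # ['score']
print(hasattr(Demo, 'score'))                      # False -- removed from the class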
Example #15
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters. The
    final splitting is done running the splitter object on a certain Dataset
    via __call__(). This method has to be implemented like a generator, i.e. it
    has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the split datasets may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """

    _STRATEGIES = ('first', 'random', 'equidistant')
    _NPERLABEL_STR = ['equal', 'all']

    def __init__(self,
                 npertarget='all',
                 nrunspersplit=1,
                 permute_attr=None,
                 count=None,
                 strategy='equidistant',
                 discard_boundary=None,
                 attr='chunks',
                 reverse=False,
                 noslicing=False):
        """Initialize splitter base.

        Parameters
        ----------
        npertarget : int or str (or list of them) or float
          Number of dataset samples per label to be included in each
          split. If given as a float, it must be in [0,1] range and would
          mean the ratio of selected samples per each label.
          Two special strings are recognized: 'all' uses all available
          samples (default) and 'equal' uses the maximum number of samples
          that can be provided by all of the classes. This value might be
          provided as a sequence length of which matches the number of datasets
          per split and indicates the configuration for the respective dataset
          in each split.
        nrunspersplit : int
          Number of times samples for each split are chosen. This
          is mostly useful if a subset of the available samples
          is used in each split and the subset is randomly
          selected for each run (see the `npertarget` argument).
        permute_attr : None or str
          If set to a string (e.g. 'targets'), the corresponding .sa
          of each generated dataset will be permuted on a per-chunk
          basis.
        count : None or int
          Desired number of splits to be output. It is limited by the
          number of splits possible for a given splitter
          (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
          all splits are output (default).
        strategy : str
          If `count` is not None, the following strategies are possible:
          'first': First `count` splits are chosen;
          'random': Random (without replacement) `count` splits are chosen;
          'equidistant': Splits which are equidistant from each other.
        discard_boundary : None or int or sequence of int
          If not `None`, how many samples on the boundaries between
          parts of the split to discard in the training part.
          If int, then discarded in all parts.  If a sequence, numbers
          to discard are given per part of the split.
          E.g. if splitter splits only into (training, testing)
          parts, then `discard_boundary=(2,0)` would instruct to discard
          2 samples from training which are on the boundary with testing.
        attr : str
          Sample attribute used to determine splits.
        reverse : bool
          If True, the order of datasets in the split is reversed, e.g.
          instead of (training, testing), (testing, training) will be spit
          out
        noslicing : bool
          If True, dataset splitting is not done by slicing (causing
          shared data between source and split datasets) even if it would
          be possible. By default slicing is performed whenever possible
          to reduce the memory footprint.
        """
        # pylint happiness block
        self.__npertarget = None
        self.__runspersplit = nrunspersplit
        self.__permute_attr = permute_attr
        self.__splitattr = attr
        self.__noslicing = noslicing
        self._reverse = reverse
        self.discard_boundary = discard_boundary

        # we don't check it, thus no reason to make it private.
        # someone might find it useful to change post creation
        # TODO utilize such (or similar) policy through out the code
        self.count = count
        """Number (max) of splits to output on call"""

        self._set_strategy(strategy)

        # pattern sampling status vars
        self.set_n_per_label(npertarget)

    __doc__ = enhanced_doc_string('Splitter', locals())

    ##REF: Name was automagically refactored
    def _set_strategy(self, strategy):
        """Set strategy to select splits out from available
        """
        strategy = strategy.lower()
        if not strategy in self._STRATEGIES:
            raise ValueError, "strategy is not known. Known are %s" \
                  % str(self._STRATEGIES)
        self.__strategy = strategy

    ##REF: Name was automagically refactored
    def set_n_per_label(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        if isinstance(value, basestring):
            if not value in self._NPERLABEL_STR:
                raise ValueError, "Unsupported value '%s' for npertarget." \
                      " Supported ones are %s or float or int" \
                      % (value, self._NPERLABEL_STR)
        self.__npertarget = value

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattr):
        """Return list with samples of 2nd dataset in a split.

        Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of lists
        containing sample ids to split into the second dataset.
        """
        raise NotImplementedError

    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """

        # local bindings to methods to gain some speedup
        ds_class = dataset.__class__

        # for each split
        cfgs = self.splitcfg(dataset)
        n_cfgs = len(cfgs)

        # Finally split the data
        for isplit, split in enumerate(cfgs):

            # determine sample sizes
            if not operator.isSequenceType(self.__npertarget) \
                   or isinstance(self.__npertarget, str):
                npertargetsplit = [self.__npertarget] * len(split)
            else:
                npertargetsplit = self.__npertarget

            # get splitted datasets
            split_ds = self.split_dataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, npertarget in zip(split_ds, npertargetsplit):
                    # Set flag on dataset whether this was the last split
                    # ??? per our discussion this might be the best
                    #     solution which would scale if we care about
                    #     thread-safety etc
                    if ds is not None:
                        ds_a = ds.a
                        lastsplit = (isplit == n_cfgs - 1)
                        if not ds_a.has_key('lastsplit'):
                            # if not yet known -- add one
                            ds_a['lastsplit'] = lastsplit
                        else:
                            # otherwise just assign a new value
                            ds_a.lastsplit = lastsplit
                    # permute the labels
                    if self.__permute_attr is not None:
                        permute_attr(ds,
                                     attr=self.__permute_attr,
                                     chunks_attr='chunks',
                                     col='sa')

                    # select subset of samples if requested
                    if npertarget == 'all' or ds is None:
                        finalized_datasets.append(ds)
                    else:
                        # We need to select a subset of samples
                        # TODO: move all this logic within random_sample

                        # go for maximum possible number of samples provided
                        # by each label in this dataset
                        if npertarget == 'equal':
                            # determine the min number of samples per class
                            npl = np.array(
                                get_nsamples_per_attr(
                                    ds, 'targets').values()).min()
                        elif isinstance(npertarget, float) or (
                                operator.isSequenceType(npertarget)
                                and len(npertarget) > 0
                                and isinstance(npertarget[0], float)):
                            # determine number of samples per class and take
                            # a ratio
                            counts = np.array(
                                get_nsamples_per_attr(ds, 'targets').values())
                            npl = (counts * npertarget).round().astype(int)
                        else:
                            npl = npertarget

                        # finally select the patterns
                        finalized_datasets.append(random_samples(ds, npl))

                if self._reverse:
                    yield finalized_datasets[::-1]
                else:
                    yield finalized_datasets

    ##REF: Name was automagically refactored
    def split_dataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        Parameters
        ----------
        dataset : Dataset
          The source dataset.
        specs : sequence of sequences
          Contains ids of a sample attribute that shall be separated into
          the second dataset.

        Returns
        -------
        Tuple of split datasets.
        """
        # collect the sample ids for each resulting dataset
        filters = []
        none_specs = 0
        cum_filter = None

        # Prepare discard_boundary
        discard_boundary = self.discard_boundary
        if isinstance(discard_boundary, int):
            if discard_boundary != 0:
                discard_boundary = (discard_boundary, ) * len(specs)
            else:
                discard_boundary = None

        splitattr_data = dataset.sa[self.__splitattr].value
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = np.array([ i in spec \
                                    for i in splitattr_data], dtype='bool')
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    cum_filter = np.logical_and(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError, "Splitter cannot handle more than one `None` " \
                              "split definition."

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = np.logical_not(cum_filter)

            # If it was told to discard samples on the boundary to the
            # other parts of the split
            if discard_boundary is not None:
                ndiscard = discard_boundary[i]
                if ndiscard != 0:
                    # XXX sloppy implementation for now. It still
                    # should not be the main reason for a slow-down of
                    # the whole analysis ;)
                    f, lenf = filters[i], len(filters[i])
                    f_pad = np.concatenate(
                        ([True] * ndiscard, f, [True] * ndiscard))
                    for d in xrange(2 * ndiscard + 1):
                        f = np.logical_and(f, f_pad[d:d + lenf])
                    filters[i] = f[:]
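                    # the padding trick above erodes the True-regions of the
                    # filter: e.g. with ndiscard=1 a filter 1 1 1 0 0 1 1 1
                    # becomes 1 1 0 0 0 0 1 1, dropping one sample on each
                    # side of every internal boundary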

        # split data: return None if no samples are left
        # XXX: Maybe it should simply return an empty dataset instead, but
        #      keeping it this way for now, to maintain current behavior
        split_datasets = []

        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                # check whether we can do slicing instead of advanced
                # indexing -- if we can split the dataset without causing
                # the data to be copied, it is quicker and leaner.
                # However, it only works if we have a contiguous chunk or
                # regular step sizes for the samples to be split
                split_datasets.append(dataset[self._filter2slice(filter_)])

        return split_datasets

    def _filter2slice(self, bf):
        if self.__noslicing:
            # we are not allowed to help :-(
            return bf
        # the filter should be a boolean array
        if not len(bf):
            raise ValueError("'%s' recieved an empty filter. This is a "
                             "bug." % self.__class__.__name__)
        # get indices of non-zero filter elements
        idx = bf.nonzero()[0]
        idx_start = idx[0]
        idx_end = idx[-1] + 1
        idx_step = None
        if len(idx) > 1:
            # we need to figure out if there is a regular step-size
            # between elements
            stepsizes = np.unique(idx[1:] - idx[:-1])
            if len(stepsizes) > 1:
                # multiple step-sizes -> slicing is not possible -> return
                # original filter
                return bf
            else:
                idx_step = stepsizes[0]

        sl = slice(idx_start, idx_end, idx_step)
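        # e.g. a filter [F, T, F, T, F, T] gives idx=[1, 3, 5], a single
        # step size of 2 and hence slice(1, 6, 2); irregular patterns such
        # as [T, T, F, T] yield multiple step sizes and are returned as the
        # original boolean filter above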
        if __debug__:
            debug(
                "SPL", "Splitting by basic slicing is possible and permitted "
                "(%s)." % sl)
        return sl

    def __str__(self):
        """String summary over the object
        """
        return \
          "SplitterConfig: npertarget:%s runs-per-split:%d permute_attr:%s" \
          % (self.__npertarget, self.__runspersplit, self.__permute_attr)

    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset"""
        cfgs = self._get_split_config(dataset.sa[self.__splitattr].unique)

        # Select just some splits if desired
        count, n_cfgs = self.count, len(cfgs)

        # further makes sense only iff count < n_cfgs,
        # otherwise all strategies are equivalent
        if count is not None and count < n_cfgs:
            if count < 1:
                # we can only wish good luck
                return []
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out what step is needed to
                    # accommodate the `count` number
                    step = float(n_cfgs) / count
                    assert (step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
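                    # e.g. n_cfgs=10, count=3 -> step~3.33 and
                    # indexes [0, 3, 7], i.e. splits spread evenly
                    # across the full sequence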
                elif strategy == 'random':
                    indexes = np.random.permutation(range(n_cfgs))[:count]
                    # doesn't matter much but lets keep them in the original
                    # order at least
                    indexes.sort()
                else:
                    # who said that I am paranoid?
                    raise RuntimeError, "Really should not happen"
                if __debug__:
                    debug(
                        "SPL", "For %s strategy selected %s splits "
                        "from %d total" % (strategy, indexes, n_cfgs))
                cfgs = [cfgs[i] for i in indexes]

        return cfgs

    strategy = property(fget=lambda self: self.__strategy, fset=_set_strategy)
    splitattr = property(fget=lambda self: self.__splitattr)
    permute_attr = property(fget=lambda self: self.__permute_attr)
    npertarget = property(fget=lambda self: self.__npertarget)
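
The split-selection policy in splitcfg above reduces to picking indexes out of the list of available configurations. A minimal standalone sketch of that policy (plain Python/NumPy; the function name select_split_indexes is illustrative and not part of the PyMVPA API):

import numpy as np

def select_split_indexes(n_cfgs, count, strategy='first'):
    # Sketch of the selection policy used by Splitter.splitcfg above.
    if count is None or count >= n_cfgs:
        return list(range(n_cfgs))
    if count < 1:
        return []
    if strategy == 'first':
        return list(range(count))
    if strategy == 'equidistant':
        step = float(n_cfgs) / count
        return [int(round(step * i)) for i in range(count)]
    if strategy == 'random':
        return sorted(np.random.permutation(n_cfgs)[:count])
    raise ValueError("Unknown strategy %r" % strategy)

# select_split_indexes(10, 3, 'equidistant')  ->  [0, 3, 7]
# select_split_indexes(10, 3, 'first')        ->  [0, 1, 2]
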
Ejemplo n.º 16
0
    def __init__(cls, name, bases, dict):
        """
        Parameters
        ----------
        name : str
          Name of the class
        bases : iterable
          Base classes
        dict : dict
          Attributes.
        """
        if __debug__:
            debug(
                "COLR",
                "AttributesCollector call for %s.%s, where bases=%s, dict=%s " \
                % (cls, name, bases, dict))

        super(AttributesCollector, cls).__init__(name, bases, dict)

        collections = {}
        for name, value in dict.iteritems():
            if isinstance(value, IndexedCollectable):
                baseclassname = value.__class__.__name__
                col = _known_collections[baseclassname][0]
                # XXX should we allow to throw exceptions here?
                if not collections.has_key(col):
                    collections[col] = {}
                collections[col][name] = value
                # and assign name if not yet was set
                if value.name is None:
                    value.name = name
                # !!! We do not keep copy of this attribute static in the class.
                #     Due to below traversal of base classes, we should be
                #     able to construct proper collections even in derived classes
                delattr(cls, name)

        # XXX can we first collect parent's ca and then populate with ours?
        # TODO

        for base in bases:
            if hasattr(base, "__metaclass__") and \
                   base.__metaclass__ == AttributesCollector:
                # TODO take care about overriding one from super class
                # for state in base.ca:
                #    if state[0] =
                newcollections = base._collections_template
                if len(newcollections) == 0:
                    continue
                if __debug__:  # XXX RF:  and "COLR" in debug.active:
                    debug(
                        "COLR", "Collect collections %s for %s from %s" %
                        (newcollections, cls, base))
                for col, collection in newcollections.iteritems():
                    if collections.has_key(col):
                        collections[col].update(collection)
                    else:
                        collections[col] = collection

        if __debug__:
            debug(
                "COLR",
                "Creating ConditionalAttributesCollection template %s with collections %s"
                % (cls, collections.keys()))

        # if there is an explicit
        if hasattr(cls, "_ATTRIBUTE_COLLECTIONS"):
            for col in cls._ATTRIBUTE_COLLECTIONS:
                if not col in _col2class:
                    raise ValueError, \
                          "Requested collection %s is unknown to collector" % \
                          col
                if not col in collections:
                    collections[col] = None

        # TODO: check on conflict in names of Collections' items!  since
        # otherwise even order is not definite since we use dict for
        # collections.
        # XXX should we switch to tuple?

        for col, colitems in collections.iteritems():
            # so far we collected the collection items in a dict, but the new
            # API requires to pass a _list_ of collectables instead of a dict.
            # So, whenever there are items, we pass just the values of the dict.
            # There is no information lost, since the keys of the dict are the
            # name attributes of each collectable in the list.
            if not colitems is None:
                collections[col] = _col2class[col](items=colitems.values())
            else:
                collections[col] = _col2class[col]()

        setattr(cls, "_collections_template", collections)

        #
        # Expand documentation for the class based on the listed
        # parameters and on whether it is stateful
        #
        # TODO -- figure nice way on how to alter __init__ doc directly...
        textwrapper = TextWrapper(subsequent_indent="    ",
                                  initial_indent="    ",
                                  width=70)

        # Parameters
        paramsdoc = []
        paramscols = []
        for col in ('params', 'kernel_params'):
            if collections.has_key(col):
                paramscols.append(col)
                # lets at least sort the parameters for consistent output
                col_items = collections[col]
                iparams = [(v._instance_index, k)
                           for k, v in col_items.iteritems()]
                iparams.sort()
                paramsdoc += [(col_items[iparam[1]].name,
                               col_items[iparam[1]]._paramdoc())
                              for iparam in iparams]

        # The parameters collection could be hashed to decide if
        # any were changed? XXX maybe not needed at all?
        setattr(cls, "_paramscols", paramscols)

        # States doc
        cadoc = ""
        if collections.has_key('ca'):
            paramsdoc += [('enable_ca', "enable_ca : None or list of str\n  "
                           "Names of the conditional attributes which should "
                           "be enabled in addition\n  to the default ones"),
                          ('disable_ca', "disable_ca : None or list of str\n  "
                           "Names of the conditional attributes which should "
                           "be disabled"
                           "")]
            if len(collections['ca']):
                cadoc += '\n'.join(
                    ['* ' + x for x in collections['ca'].listing])
                cadoc += "\n\n(Conditional attributes enabled by default suffixed with `+`)"
            if __debug__:
                debug("COLR", "Assigning __cadoc to be %s" % cadoc)
            setattr(cls, "_cadoc", cadoc)

        if len(paramsdoc):
            if __debug__ and 'COLR' in debug.active:
                debug("COLR", "Assigning __paramsdoc to be %s" % paramsdoc)
            setattr(cls, "_paramsdoc", paramsdoc)

        if len(paramsdoc) or cadoc != "":
            cls.__doc__ = enhanced_doc_string(cls, *bases)
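
The harvesting pattern above can be hard to follow inside the full PyMVPA metaclass; a much-reduced, self-contained sketch of the same idea (Attr and Collector are illustrative names, not PyMVPA's API):

class Attr(object):
    """Marker object declared at class level and harvested by the metaclass."""
    def __init__(self, doc=None):
        self.doc = doc
        self.name = None

class Collector(type):
    def __init__(cls, name, bases, dct):
        super(Collector, cls).__init__(name, bases, dct)
        template = {}
        # inherit harvested attributes from base classes first ...
        for base in bases:
            template.update(getattr(base, '_template', {}))
        # ... then harvest our own declarations and remove them from the class
        for attr_name, value in list(dct.items()):
            if isinstance(value, Attr):
                value.name = attr_name
                template[attr_name] = value
                delattr(cls, attr_name)
        cls._template = template

# Python 2 usage:
#   class Thing(object):
#       __metaclass__ = Collector
#       threshold = Attr(doc="some tunable")
#   Thing._template  ->  {'threshold': <Attr instance>}
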
Ejemplo n.º 17
0
class DatasetMeasure(ClassWithCollections):
    """A measure computed from a `Dataset`

    All dataset measures support arbitrary transformation of the measure
    after it has been computed. Transformations are done by processing the
    measure with a mapper that is specified via the `postproc` keyword
    argument of the constructor. Upon request, the raw measure (before
    transformations are applied) is stored in the `raw_results` conditional
    attribute.

    Additionally all dataset measures support the estimation of the
    probabilit(y,ies) of a measure under some distribution. Typically this will
    be the NULL distribution (no signal), that can be estimated with
    permutation tests. If a distribution estimator instance is passed to the
    `null_dist` keyword argument of the constructor the respective
    probabilities are automatically computed and stored in the `null_prob`
    conditional attribute.

    Notes
    -----
    For developers: All subclasses shall get all necessary parameters via
    their constructor, so it is possible to get the same type of measure for
    multiple datasets by passing them to the __call__() method successively.

    """

    raw_results = ConditionalAttribute(enabled=False,
        doc="Computed results before applying any " +
            "transformation algorithm")
    null_prob = ConditionalAttribute(enabled=True)
    """Stores the probability of a measure under the NULL hypothesis"""
    null_t = ConditionalAttribute(enabled=False)
    """Stores the t-score corresponding to null_prob under assumption
    of Normal distribution"""

    def __init__(self, postproc=None, null_dist=None, **kwargs):
        """Does nothing special.

        Parameters
        ----------
        postproc : Mapper instance
          Mapper to perform post-processing of results. This mapper is applied
          in `__call__()` to perform a final processing step on the to be
          returned dataset measure. If None, nothing is done.
        null_dist : instance of distribution estimator
          The estimated distribution is used to assign a probability for a
          certain value of the computed measure.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self.__postproc = postproc
        """Functor to be called in return statement of all subclass __call__()
        methods."""
        null_dist_ = auto_null_dist(null_dist)
        if __debug__:
            debug('SA', 'Assigning null_dist %s whenever original given was %s'
                  % (null_dist_, null_dist))
        self.__null_dist = null_dist_


    __doc__ = enhanced_doc_string('DatasetMeasure', locals(),
                                  ClassWithCollections)


    def __call__(self, dataset):
        """Compute measure on a given `Dataset`.

        Each implementation has to handle a single argument: the source
        dataset.

        Returns the computed measure in some iterable (list-like)
        container, applying a post-processing mapper if one is defined.
        """
        result = self._call(dataset)
        result = self._postcall(dataset, result)

        # XXX Remove when "sensitivity-return-dataset" transition is done
        if __debug__ \
           and not isinstance(result, AttrDataset) \
           and not len(result.shape) == 1:
            warning("Postprocessing of '%s' doesn't return a Dataset, or "
                    "1D-array (got: '%s')."
                    % (self.__class__.__name__, result))
        return result


    def _call(self, dataset):
        """Actually compute measure on a given `Dataset`.

        Each implementation has to handle a single argument: the source
        dataset.

        Returns the computed measure in some iterable (list-like) container.
        """
        raise NotImplementedError


    def _postcall(self, dataset, result):
        """Some postprocessing on the result
        """
        self.ca.raw_results = result

        # post-processing
        if not self.__postproc is None:
            if __debug__:
                debug("SA_", "Applying mapper %s" % self.__postproc)
            result = self.__postproc.forward(result)

        # estimate the NULL distribution when functor is given
        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching datameasure instance, but we have to disable
            # the estimation of the null distribution in that child to prevent
            # infinite looping.
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, dataset)

            if self.ca.is_enabled('null_t'):
                # get probability under NULL hyp, but also request
                # whether it belongs to the right tail
                null_prob, null_right_tail = \
                           self.__null_dist.p(result, return_tails=True)
                self.ca.null_prob = null_prob

                externals.exists('scipy', raise_=True)
                from scipy.stats import norm

                # TODO: following logic should appear in NullDist,
                #       not here
                tail = self.null_dist.tail
                if tail == 'left':
                    acdf = np.abs(null_prob)
                elif tail == 'right':
                    acdf = 1.0 - np.abs(null_prob)
                elif tail in ['any', 'both']:
                    acdf = 1.0 - np.clip(np.abs(null_prob), 0, 0.5)
                else:
                    raise RuntimeError, 'Unhandled tail %s' % tail
                # We need to clip to avoid non-informative inf's ;-)
                # which happen due to the limited precision of the double
                # mantissa (eps ~ 2.2e-16). We could clip values around 0
                # as low as 1e-100 (corresponding to z ~= 21), but for
                # consistency let's clip at 1e-16, which still yields
                # distinguishable values around p=1 and a maximum z of 8.2.
                # That should be a sufficient range of z-values ;-)
                clip = 1e-16
                null_t = norm.ppf(np.clip(acdf, clip, 1.0 - clip))
                # assure that we deal with arrays:
                null_t = np.array(null_t, ndmin=1, copy=False)
                null_t[~null_right_tail] *= -1.0 # revert sign for negatives
                self.ca.null_t = null_t          # store
            else:
                # get probability of result under NULL hypothesis if available
                # and don't request tail information
                self.ca.null_prob = self.__null_dist.p(result)

        return result


    def __repr__(self, prefixes=[]):
        """String representation of a `DatasetMeasure`

        Includes only arguments which differ from default ones
        """
        prefixes = prefixes[:]
        if self.__postproc is not None:
            prefixes.append("postproc=%s" % self.__postproc)
        if self.__null_dist is not None:
            prefixes.append("null_dist=%s" % self.__null_dist)
        return super(DatasetMeasure, self).__repr__(prefixes=prefixes)

    def untrain(self):
        """'Untraining' Measure

        Some derived classes might use classifiers, so we need to
        untrain those.
        """
        pass

    @property
    def null_dist(self):
        """Return Null Distribution estimator"""
        return self.__null_dist

    @property
    def postproc(self):
        """Return mapper"""
        return self.__postproc
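
The p-value to z-score conversion buried in _postcall above can be exercised in isolation. A minimal sketch of that conversion (assuming scipy is available; the function name p_to_z is illustrative, not part of PyMVPA):

import numpy as np
from scipy.stats import norm

def p_to_z(null_prob, right_tail, tail='both', clip=1e-16):
    # Mirror of the null_t computation in DatasetMeasure._postcall:
    # turn probabilities under the NULL hypothesis into signed z-scores.
    p = np.abs(np.asarray(null_prob, dtype=float))
    if tail == 'left':
        acdf = p
    elif tail == 'right':
        acdf = 1.0 - p
    elif tail in ('any', 'both'):
        acdf = 1.0 - np.clip(p, 0, 0.5)
    else:
        raise ValueError("Unhandled tail %r" % tail)
    z = np.array(norm.ppf(np.clip(acdf, clip, 1.0 - clip)), ndmin=1)
    z[~np.asarray(right_tail, dtype=bool)] *= -1.0  # left-tail results get a negative sign
    return z

# p_to_z([0.01, 0.2], [True, False])  ->  approx [ 2.33, -0.84]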