class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """

    def __init__(self, **kwargs):
        # All arguments are simply forwarded to the `Splitter` base class
        Splitter.__init__(self, **(kwargs))

    __doc__ = enhanced_doc_string('HalfSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return the split definitions for both halves.

        Parameters
        ----------
        uniqueattrs : sequence
          Unique values of the sample attribute; it is sliced at its
          midpoint (for an odd length the second half is one longer).

        Returns
        -------
        list of tuples (None, list of int)
          2 items: first half of samples into 1st split
        """
        # Floor division: a slice index must stay an integer even when
        # true division is in effect.
        middle = len(uniqueattrs) // 2
        return [(None, uniqueattrs[:middle]),
                (None, uniqueattrs[middle:])]

    def __str__(self):
        """String summary over the object
        """
        return "HalfSplitter / " + Splitter.__str__(self)
class PyMVPAAtlas(XMLBasedAtlas):
    """Base class for PyMVPA atlases, such as LabelsAtlas and ReferenceAtlas
    """

    source = 'PyMVPA'

    def __init__(self, *args, **kwargs):
        # Arguments are forwarded unchanged to `XMLBasedAtlas`
        XMLBasedAtlas.__init__(self, *args, **kwargs)

        # sanity checks: the header must declare both the space and its
        # flavor
        header = self.header
        headerChildrenTags = XMLBasedAtlas._children_tags(header)
        if not ('space' in headerChildrenTags) or \
           not ('space-flavor' in headerChildrenTags):
            raise XMLAtlasException("PyMVPA Atlas requires specification of" +
                                    " the space in which atlas resides")

        self.__space = header.space.text
        self.__spaceFlavor = header['space-flavor'].text

    __doc__ = enhanced_doc_string('PyMVPAAtlas', locals(), XMLBasedAtlas)

    ##REF: Name was automagically refactored
    def _load_images(self):
        """Load the image referenced by the atlas header.

        Sets `_origin` (from the optional ``offset`` attribute of the
        image file entry, zeros otherwise), `_image` and `_data`.

        Raises
        ------
        RuntimeError
          If the image cannot be opened, or if any dimension in front of
          the last four is larger than one.
        """
        # shortcut
        imagefile = self.header.images.imagefile
        #self.nlevels = len(self._levels_by_id)

        # Set offset if defined in XML file
        # XXX: should just take one from the qoffset... now that one is
        #       defined... this origin might be misleading actually
        self._origin = np.array((0, 0, 0))
        if 'offset' in imagefile.attrib:
            self._origin = np.array(
                [int(x) for x in imagefile.get('offset').split(',')])

        # Load the image file which has labels
        if self._force_image_file is not None:
            imagefilename = self._force_image_file
        else:
            imagefilename = imagefile.text
        imagefilename = reuse_absolute_path(self._filename, imagefilename)

        try:
            self._image = NiftiImage(imagefilename)
        except RuntimeError as e:
            raise RuntimeError(
                " Cannot open file %s due to %s" % (imagefilename, e))

        self._data = self._image.data

        # remove bogus dimensions on top of 4th
        bogus_dims = self._data.shape[0:-4]
        if len(bogus_dims) > 0:
            if max(bogus_dims) > 1:
                raise RuntimeError(
                    "Atlas %s has more than 4 of non-singular "
                    "dimensions" % imagefilename)
            # reshape() returns a new array rather than working in place,
            # so the result has to be stored back
            self._data = self._data.reshape(self._data.shape[-4:])
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provide folding splitting. Given a dataset with N chunks, with
    cvtype=1 (which is default), it would generate N splits, where
    each chunk sequentially is taken out (with replacement) for
    cross-validation.  Example, if there is 4 chunks, splits for
    cvtype=1 are::

        [[1, 2, 3], [0]]
        [[0, 2, 3], [1]]
        [[0, 1, 3], [2]]
        [[0, 1, 2], [3]]

    If cvtype>1, then all possible combinations of cvtype number of
    chunks are taken out for testing, so for cvtype=2 in previous
    example::

        [[2, 3], [0, 1]]
        [[1, 3], [0, 2]]
        [[1, 2], [0, 3]]
        [[0, 3], [1, 2]]
        [[0, 2], [1, 3]]
        [[0, 1], [2, 3]]

    """

    def __init__(self, cvtype=1, **kwargs):
        """Initialize the N-fold splitter.

        Parameters
        ----------
        cvtype : int
          Type of cross-validation: N-(cvtype)
        **kwargs
          Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        # pylint happiness block
        self.__cvtype = cvtype

    __doc__ = enhanced_doc_string('NFoldSplitter', locals(), Splitter)

    def __str__(self):
        """String summary over the object
        """
        prefix = "N-%d-FoldSplitter / " % self.__cvtype
        return prefix + Splitter.__str__(self)

    def _get_split_config(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.

        One ``(None, combination)`` tuple is produced for every
        combination yielded by `support.xunique_combinations` for
        `uniqueattrs` and the configured `cvtype`.
        """
        combinations = support.xunique_combinations(uniqueattrs,
                                                    self.__cvtype)
        config = []
        for combination in combinations:
            config.append((None, combination))
        return config
class NGroupSplitter(Splitter):
    """Split a dataset into N-groups of the sample attribute.

    For example, NGroupSplitter(2) is the same as the HalfSplitter and
    yields two splits: first (1st half, 2nd half) and second (2nd half,
    1st half).
    """

    def __init__(self, ngroups=4, **kwargs):
        """Initialize the N-group splitter.

        Parameters
        ----------
        ngroups : int
          Number of groups to split the attribute into.
        **kwargs
          Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        self.__ngroups = ngroups

    __doc__ = enhanced_doc_string('NGroupSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return one split definition per group.

        Parameters
        ----------
        uniqueattrs : array
          Unique values of the sample attribute. It is indexed with a
          boolean mask, so it presumably has to be a NumPy array.

        Returns
        -------
        list of tuples (None, list of int)
          Indices for splitting

        Raises
        ------
        ValueError
          If there are fewer unique attributes than requested groups.
        """
        ngroups = self.__ngroups

        # make sure there are more of attributes than desired groups
        if len(uniqueattrs) < ngroups:
            raise ValueError(
                "Number of groups (%d) " % (ngroups) +
                "must be less than " +
                "or equal to the number of unique attributes (%d)" %
                (len(uniqueattrs)))

        # use coarsen_chunks to get the split indices
        split_ind = np.asarray(coarsen_chunks(uniqueattrs, nchunks=ngroups))

        # loop and create splits
        return [(None, uniqueattrs[split_ind == i])
                for i in range(ngroups)]

    def __str__(self):
        """String summary over the object
        """
        # The attribute is `__ngroups`; the former `__ngroup` spelling
        # raised AttributeError whenever the splitter was printed.
        return "N-%d-GroupSplitter / " % self.__ngroups \
               + Splitter.__str__(self)
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence
    of split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence for sequences of sample ids for each dataset that shall be
    generated in the split.

    Examples
    --------
    Generate two splits. In the first split the *second* dataset
    contains all samples with sample attributes corresponding to
    either 0, 1 or 2. The *first* dataset of the first split contains
    all samples which are not split into the second dataset.

    The second split yields three datasets. The first with all samples
    corresponding to sample attributes 1 and 2, the second dataset
    contains only samples with attribute 3 and the last dataset
    contains the samples with attribute 5 and 6.

    >>> splitter = CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
    """

    def __init__(self, splitrule, **kwargs):
        """
        Parameters
        ----------
        splitrule : list of tuple
          Custom splits to use
        """
        Splitter.__init__(self, **(kwargs))

        # Stored as given; it is handed out untouched later on
        self.__splitrule = splitrule

    __doc__ = enhanced_doc_string('CustomSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return the user-provided rule; `uniqueattrs` is not consulted.

        Returns
        -------
        whatever was provided in splitrule argument
        """
        rule = self.__splitrule
        return rule

    def __str__(self):
        """String summary over the object
        """
        base_summary = Splitter.__str__(self)
        return "CustomSplitter / " + base_summary
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even,
    odd).
    """

    def __init__(self, usevalues=False, **kwargs):
        """
        Parameters
        ----------
        usevalues : bool
          If True the values of the attribute used for splitting will be
          used to determine odd and even samples. If False odd and even
          chunks are defined by the order of attribute values, i.e. first
          unique attribute is odd, second is even, despite the
          corresponding values might indicate the opposite (e.g. in case
          of [2,3]).
        """
        Splitter.__init__(self, **(kwargs))

        self.__usevalues = usevalues

    __doc__ = enhanced_doc_string('OddEvenSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return the two parity-based split definitions.

        Returns
        -------
        list of tuples (None, list of int)
          2 items: odd samples into 1st split
        """
        # Parity is taken either from the attribute values themselves or
        # from their position within `uniqueattrs`
        if self.__usevalues:
            parity = uniqueattrs % 2
        else:
            parity = np.arange(len(uniqueattrs)) % 2

        odd_attrs = uniqueattrs[parity == 1]
        even_attrs = uniqueattrs[parity == 0]
        return [(None, odd_attrs), (None, even_attrs)]

    def __str__(self):
        """String summary over the object
        """
        return "OddEvenSplitter / " + Splitter.__str__(self)
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply
    returns the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """
        Parameters
        ----------
        mode
          Either 'first' or 'second' (default) -- which output dataset
          would actually contain the samples
        """
        Splitter.__init__(self, **(kwargs))

        if mode not in NoneSplitter._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode

    __doc__ = enhanced_doc_string('NoneSplitter', locals(), Splitter)

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattrs):
        """Return just one full split: no first or second dataset.

        `uniqueattrs` is ignored; only the configured mode decides which
        of the two fixed configurations is returned.
        """
        if self.__mode != 'second':
            return [(None, [])]
        return [([], None)]

    def __str__(self):
        """String summary over the object
        """
        return "NoneSplitter / " + Splitter.__str__(self)
class LabelsAtlas(PyMVPAAtlas):
    """
    Atlas which provides labels for the given coordinate
    """

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):
        """
        Return labels for the given voxel at specified levels specified by
        index

        Parameters
        ----------
        c : sequence
          Voxel coordinate; it is range-checked and then used as
          ``c[2], c[1], c[0]`` to index the atlas data.
        levels : optional
          Level selection, resolved through `_get_selected_levels`.

        Returns
        -------
        dict
          ``'voxel_queried'`` holds `c` as it was passed in, ``'labels'``
          a list with one dict (``'index'``, ``'id'``, ``'label'``) per
          selected level.

        Raises
        ------
        IndexError
          If a selected level is not known to the atlas.
        """
        levels = self._get_selected_levels(levels=levels)
        # Stored before range checking, i.e. exactly as given by the caller
        result = {'voxel_queried': c}

        # check range
        c = self._check_range(c)

        resultLevels = []
        for level in levels:
            if level not in self._levels:
                # %s instead of %d: a level may be given by description,
                # and %d would then raise TypeError instead of IndexError
                raise IndexError(
                    "Unknown index or description for level %s" % (level,))
            level_ = self._levels[level]

            resultIndex = int(self._data[level_.index, c[2], c[1], c[0]])

            resultLevels.append({'index': level_.index,
                                 'id': level_.description,
                                 'label': level_[resultIndex]})

        result['labels'] = resultLevels
        return result

    __doc__ = enhanced_doc_string('LabelsAtlas', locals(), PyMVPAAtlas)
class ProjectionMapper(Mapper):
    """Linear mapping between multidimensional spaces.

    This class cannot be used directly. Sub-classes have to implement
    the `_train()` method, which has to compute the projection matrix
    `_proj` and optionally offset vectors `_offset_in` and
    `_offset_out` (if initialized with demean=True, which is default)
    given a dataset (see `_train()` docstring for more information).

    Once the projection matrix is available, this class provides
    functionality to perform forward and backwards linear mapping of
    data, the latter by default using pseudo-inverse (but could be
    altered in subclasses, like hermitian (conjugate) transpose in
    case of SVD).  Additionally, `ProjectionMapper` supports optional
    selection of arbitrary component (i.e. columns of the projection
    matrix) of the projection.

    Forward and back-projection matrices (a.k.a. *projection* and
    *reconstruction*) are available via the `proj` and `recon`
    properties.
    """

    _DEV__doc__ = """Think about renaming `demean`, may be `translation`?"""

    def __init__(self, selector=None, demean=True):
        """Initialize the ProjectionMapper

        Parameters
        ----------
        selector : None or list
          Which components (i.e. columns of the projection matrix)
          should be used for mapping. If `selector` is `None` all
          components are used. If a list is provided, all list
          elements are treated as component ids and the respective
          components are selected (all others are discarded).
        demean : bool
          Either data should be demeaned while computing
          projections and applied back while doing reverse()
        """
        Mapper.__init__(self)

        # by default we want to wipe the feature attributes out during mapping
        self._fa_filter = []

        self._selector = selector
        self._proj = None
        """Forward projection matrix."""
        self._recon = None
        """Reverse projection (reconstruction) matrix."""
        self._demean = demean
        """Flag whether to demean the to be projected data, prior to projection.
        """
        self._offset_in = None
        """Offset (most often just mean) in the input space"""
        self._offset_out = None
        """Offset (most often just mean) in the output space"""

    __doc__ = enhanced_doc_string('ProjectionMapper', locals(), Mapper)

    @accepts_dataset_as_samples
    def _pretrain(self, samples):
        """Store the input-space offset prior to training.

        Parameters
        ----------
        samples : array
          Samples to operate on. If demeaning is enabled their mean along
          the first axis becomes `_offset_in`.
        """
        if self._demean:
            self._offset_in = samples.mean(axis=0)

    def _posttrain(self, dataset):
        """Apply the component selection, if a selector was configured.
        """
        # perform component selection
        if self._selector is not None:
            self.select_out(self._selector)

    ##REF: Name was automagically refactored
    def _demean_data(self, data):
        """Helper which optionally demeans

        Returns `data` with `_offset_in` subtracted if demeaning is
        enabled, otherwise `data` itself.
        """
        if self._demean:
            # demean the training data
            data = data - self._offset_in

            if __debug__ and "MAP_" in debug.active:
                debug("MAP_",
                      "%s: Mean of data in input space %s was subtracted" %
                      (self.__class__.__name__, self._offset_in))
        return data

    def _forward_data(self, data):
        """Project `data` from the input into the output space.

        Raises
        ------
        RuntimeError
          If no projection matrix is available yet.
        """
        if self._proj is None:
            raise RuntimeError("Mapper needs to be trained before used.")

        # local binding
        demean = self._demean

        d = np.asmatrix(data)

        # Remove input offset if present
        if demean and self._offset_in is not None:
            d = d - self._offset_in

        # Do forward projection
        res = (d * self._proj).A

        # Add output offset if present.  Not done in place: an in-place
        # add fails when the offset dtype cannot be cast to that of `res`
        if demean and self._offset_out is not None:
            res = res + self._offset_out

        return res

    def _reverse_data(self, data):
        """Project `data` from the output back into the input space.

        Raises
        ------
        RuntimeError
          If no projection matrix is available yet.
        """
        if self._proj is None:
            raise RuntimeError("Mapper needs to be trained before used.")
        d = np.asmatrix(data)
        # Remove offset if present in output space
        if self._demean and self._offset_out is not None:
            d = d - self._offset_out

        # Do reverse projection
        res = (d * self.recon).A

        # Add offset in input space (out of place, see _forward_data)
        if self._demean and self._offset_in is not None:
            res = res + self._offset_in

        return res

    ##REF: Name was automagically refactored
    def _compute_recon(self):
        """Given that a projection is present -- compute reconstruction matrix.
        By default -- pseudoinverse of projection matrix.
        Might be overridden in derived classes for efficiency.
        """
        return np.linalg.pinv(self._proj)

    ##REF: Name was automagically refactored
    def _get_recon(self):
        """Compute (if necessary) and return reconstruction matrix
        """
        # (re)build reconstruction matrix; the result is cached in `_recon`
        recon = self._recon
        if recon is None:
            self._recon = recon = self._compute_recon()
        return recon

    proj = property(fget=lambda self: self._proj, doc="Projection matrix")
    recon = property(fget=_get_recon, doc="Backprojection matrix")
class FSLAtlas(XMLBasedAtlas):
    """Base class for FSL atlases
    """

    source = 'FSL'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to `XMLBasedAtlas` and set space to 'MNI'.
        """
        XMLBasedAtlas.__init__(self, *args, **kwargs)
        self.space = 'MNI'

    __doc__ = enhanced_doc_string('FSLAtlas', locals(), XMLBasedAtlas)

    ##REF: Name was automagically refactored
    def _load_images(self):
        """Pick the atlas image to use and expose its data.

        Candidates are the image files listed in the header, or only the
        forced image file if one was given.  Without a requested
        resolution the candidate with the smallest ``pixdim[0]`` wins;
        otherwise the first candidate matching the requested resolution
        exactly is taken.

        Raises
        ------
        RuntimeError
          If a candidate cannot be opened or no candidate was selected.
        """
        wanted = self._resolution
        images = self.header.images

        # Load present images
        # XXX might be refactored to avoid duplication of
        #     effort with PyMVPAAtlas
        if self._force_image_file is not None:
            imagefile_candidates = [self._force_image_file]
        else:
            imagefile_candidates = [
                reuse_absolute_path(self._filename, i.imagefile.text,
                                    force=True)
                for i in images]

        ni_image = None
        resolutions = []
        for imagefilename in imagefile_candidates:
            try:
                candidate = NiftiImage(imagefilename, load=False)
            except RuntimeError:
                raise RuntimeError(" Cannot open file " + imagefilename)

            candidate_res = candidate.pixdim[0]
            if wanted is None:
                # select this one if the best
                if ni_image is None or candidate_res < ni_image.pixdim[0]:
                    ni_image = candidate
                    self._image_file = imagefilename
                continue

            if candidate_res == wanted:
                ni_image = candidate
                self._image_file = imagefilename
                break
            # remember what was rejected for the error message below
            resolutions += [candidate_res]

        # TODO: also make use of summaryimagefile may be?

        if ni_image is None:
            msg = "Could not find an appropriate atlas among %d atlases." \
                  % len(imagefile_candidates)
            if wanted is not None:
                msg += " Atlases had resolutions %s" % \
                       (resolutions,)
            raise RuntimeError(msg)

        if __debug__:
            debug('ATL__', "Loading atlas data from %s" % self._image_file)

        self._image = ni_image
        self._resolution = ni_image.pixdim[0]
        self._origin = np.abs(ni_image.header['qoffset']) * 1.0  # XXX
        self._data = self._image.data
class FSLProbabilisticAtlas(FSLAtlas):
    """Probabilistic FSL atlases
    """

    def __init__(self, thr=0.0, strategy='all', sort=True, *args, **kwargs):
        """
        Parameters
        ----------
        thr : float
          Value to threshold at
        strategy : str
          Possible values
            all - all entries above thr
            max - entry with maximal value
        sort : bool
          Either to sort entries for 'all' strategy according to
          probability
        """
        FSLAtlas.__init__(self, *args, **kwargs)
        self.thr = thr
        self.strategy = strategy
        self.sort = sort

    __doc__ = enhanced_doc_string('FSLProbabilisticAtlas', locals(), FSLAtlas)

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):
        """Return labels for the voxel

        Parameters
        ----------
        c : tuple of coordinates (xyz)
        levels : just for API consistency (heh heh). Must be 0 for FSL atlases

        Returns
        -------
        dict
          ``'voxel_queried'`` is the range-checked coordinate and
          ``'labels'`` a single-item list holding the list of entries
          (dicts with ``index``, ``label`` and ``prob``) above `thr`.

        Raises
        ------
        ValueError
          If `levels` is neither None nor a form of 0, or if the
          configured strategy is unknown.
        """
        if levels is not None and not (levels in [0, [0], (0, )]):
            raise ValueError(
                "I guess we don't support levels other than 0 in FSL atlas."
                " Got levels=%s" % (levels,))

        # check range
        c = self._check_range(c)

        # XXX think -- may be we should better assign each map to a
        # different level
        level = 0
        resultLabels = []
        for index, area in enumerate(self._levels[level]):
            prob = int(self._data[index, c[2], c[1], c[0]])
            if prob > self.thr:
                resultLabels.append(dict(index=index,
                                         #id=
                                         label=area.text,
                                         prob=prob))

        if self.sort or self.strategy == 'max':
            # descending by probability; the sort is stable, so ties keep
            # their original relative order
            resultLabels.sort(key=lambda entry: entry['prob'], reverse=True)

        if self.strategy == 'max':
            resultLabels = resultLabels[:1]
        elif self.strategy == 'all':
            pass
        else:
            raise ValueError('Unknown strategy %s' % self.strategy)

        result = {'voxel_queried': c,
                  # in the list since we have only single level but
                  # with multiple entries
                  'labels': [resultLabels]}

        return result

    def find(self, *args, **kwargs):
        """Just a shortcut to the only level.

        See :class:`~mvpa.atlases.base.Level.find` for more info
        """
        return self.levels[0].find(*args, **kwargs)

    def get_map(self, target, strategy='unique', axes_order='xyz'):
        """Return a probability map as an array

        Parameters
        ----------
        target : int or str or re._pattern_type
          If int, map for given index is returned. Otherwise, .find is
          called with ``unique=True`` to find matching area
        strategy : str in ('unique', 'max')
          If 'unique', then if multiple areas match, exception would be
          raised.  In case of 'max', each voxel would get maximal value
          of probabilities from all matching areas.  Any value other than
          'unique' is currently treated as 'max'.
        axes_order : str in ('xyz', 'zyx')
          In what order axes of the returned array should follow.

        Raises
        ------
        ValueError
          If `axes_order` is not one of the known values.
        """
        if isinstance(target, int):
            res = self._data[target]
            if axes_order == 'xyz':
                # ATM we store/access in zyx (kji) order, so we would need
                # to swap
                return res.T
            elif axes_order == 'zyx':
                return res
            else:
                raise ValueError(
                    "Unknown axes_order=%r provided" % (axes_order,))
        else:
            lev = self.levels[0]       # we have just 1 here
            if strategy == 'unique':
                return self.get_map(lev.find(target, unique=True).index,
                                    axes_order=axes_order)
            else:
                maps_dict = self.get_maps(target, axes_order=axes_order)
                # list() so that a dictionary view is materialized too
                maps = np.array(list(maps_dict.values()))
                return np.max(maps, axis=0)

    def get_maps(self, target, axes_order='xyz', key_attr=None,
                 overlaps=None):
        """Return a dictionary of probability maps for the target

        Each key is a `Label` instance, and value is the probability map

        Parameters
        ----------
        target : str or re._pattern_type
          .find is called with a target and unique=False to find all matches
        axes_order : str in ('xyz', 'zyx')
          In what order axes of the returned array should follow.
        key_attr : None or str
          What to use for the keys of the dictionary.  If None,
          `Label` instance would be used as a key.  If some attribute
          provided (e.g. 'text', 'abbr', 'index'), corresponding
          attribute of the `Label` instance would be taken as a key.
        overlaps : None or {'max'}
          How to treat overlaps in maps.  If None, nothing is done and maps
          might have overlaps.  If 'max', then maps would not overlap and
          competing maps will be resolved based on maximal value (e.g. if
          maps contain probabilities).

        Raises
        ------
        ValueError
          If `overlaps` is neither None nor 'max'.
        """
        lev = self.levels[0]       # we have just 1 here
        if key_attr is None:
            key_gen = lambda x: x
        else:
            key_gen = lambda x: getattr(x, key_attr)

        res = [[key_gen(l), self.get_map(l.index, axes_order=axes_order)]
               for l in lev.find(target, unique=False)]

        if overlaps == 'max':
            # nothing to resolve (and argmax would fail) without any map
            if res:
                # not efficient since it places all maps back into a single
                # ndarray... but well
                maps = np.array([entry[1] for entry in res])
                maximums = np.argmax(maps, axis=0)
                # voxels where more than one map is non-zero
                contested = np.sum(maps != 0, axis=0) > 1
                # now lets go and infiltrate maps:
                # and do silly loop since we will reassign
                # the entries possibly
                for i, entry in enumerate(res):
                    loosers = np.logical_and(contested, ~(maximums == i))
                    # copy only if this map really lost some voxel
                    if loosers.any():
                        # copy and modify
                        m_new = entry[1].copy()
                        m_new[loosers] = 0
                        entry[1] = m_new
        elif overlaps is None:
            pass
        else:
            raise ValueError(
                "Incorrect value of overlaps argument %s" % overlaps)
        return dict(res)
class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """

    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        Parameters
        ----------
        source : str or dict
          If values is given as a string all data is read from the
          file and additional keyword arguments can be used to
          customize the read procedure. If a dictionary is passed
          its items are adopted as they are (the column objects are
          not copied) and checked for equal length.
        header : bool or list of str
          Indicates whether the column names should be read from the
          first line (`header=True`). If `header=False` unique
          column names will be generated (see class docs). If
          `header` is a python list, it's content is used as column
          header names and its length has to match the number of
          columns in the file.
        sep : str or None
          Separator string. The actual meaning depends on the output
          format (see class docs).
        headersep : str or None
          Separator string used in the header. The actual meaning
          depends on the output format (see class docs).
        dtype : type or list(types)
          Desired datatype(s). Datatype per column get be specified by
          passing a list of types.
        skiplines : int
          Number of lines to skip at the beginning of the file.

        Raises
        ------
        ValueError
          If `source` is neither a string nor a dictionary.
        """
        # init base class
        dict.__init__(self)

        # intialize with default
        self._header_order = None

        if isinstance(source, str):
            self._from_file(source, header=header, sep=sep,
                            headersep=headersep, dtype=dtype,
                            skiplines=skiplines)

        elif isinstance(source, dict):
            for k, v in source.items():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError('Unkown source for ColumnData [%r]'
                             % type(source))

        # generate missing properties for each item in the header
        cls = self.__class__
        classdict = cls.__dict__
        for k in list(self.keys()):
            # Sanitarize the key, substitute ' []' with '_'
            k_ = sub(r'[[\] ]', '_', k)
            # replace multipe _s
            k_ = sub('__+', '_', k_)
            # remove quotes
            k_ = sub('["\']', '', k_)

            # Skip what is already registered, and never shadow an
            # existing attribute (e.g. dict methods such as `values`):
            # that would break the class for every instance.
            if k in classdict or hasattr(cls, k_):
                continue

            if __debug__:
                debug("IOH", "Registering property %s for ColumnData key %s"
                      % (k_, k))

            # The key is bound through a default argument, so each getter
            # keeps its own key regardless of later loop iterations.  This
            # replaces exec on string-built code, which failed for keys
            # containing quotes or for names that are no identifiers.
            setattr(cls, k_,
                    property(fget=lambda self, key=k: self._get_attrib(key)))

    __doc__ = enhanced_doc_string('ColumnData', locals())

    ##REF: Name was automagically refactored
    def _get_attrib(self, key):
        """Return corresponding value if given key is known to current instance

        Is used for automatically added properties to the class.

        Raises
        ------
        ValueError:
          If `key` is not known to given instance

        Returns
        -------
        Value if `key` is known
        """
        if key in self:
            return self[key]
        raise ValueError("Instance %r has no data about %r" % (self, key))

    def __str__(self):
        """Class name followed by the dimensions and the column names.
        """
        s = self.__class__.__name__
        if len(self.keys()) > 0:
            s += " %d rows, %d columns [" % \
                 (self.nrows, self.ncolumns)
            s += " ".join(["%s" % k for k in self.keys()])
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.

        Raises
        ------
        ValueError
          If not all columns have the same length.
        """
        length = None
        for k in self.keys():
            if length is None:
                length = len(self[k])
            elif len(self[k]) != length:
                raise ValueError("Data integrity lost. Columns do not "
                                 "have equal length.")

    def _from_file(self, filename, header, sep, headersep,
                   dtype, skiplines):
        """Loads column data from file -- clears object first.

        Empty lines and lines starting with '#' are ignored.  Values which
        cannot be converted into the desired datatype are kept as strings
        and a warning is issued.

        Raises
        ------
        RuntimeError
          If a line does not have as many entries as there are columns.
        """
        # make a clean table
        self.clear()
        self._header_order = None

        file_ = open(filename, 'r')
        # make sure the file gets closed even if parsing fails
        try:
            # simply skip some lines
            for _ in range(skiplines):
                file_.readline()

            # make column names, either take header or generate
            # (`== True` on purpose: `header` may also be a list)
            if header == True:
                # read first line and split by 'headersep'; the line
                # terminator is dropped first, otherwise it would end up
                # in the last column name if `headersep` is given
                hdr = file_.readline().rstrip('\r\n').split(headersep)
                # remove bogus empty header titles
                hdr = [x for x in hdr if len(x.strip())]
                self._header_order = hdr
            elif isinstance(header, list):
                hdr = header
            else:
                hdr = [str(i)
                       for i in range(len(file_.readline().split(sep)))]
                # reset file to not miss the first line
                file_.seek(0)
                for _ in range(skiplines):
                    file_.readline()

            # string in lists: one per column
            tbl = [[] for _ in hdr]

            # do per column dtypes
            if not isinstance(dtype, list):
                dtype = [dtype] * len(hdr)

            # parse line by line and feed into the lists
            for line in file_:
                # get rid of leading and trailing whitespace
                line = line.strip()
                # ignore empty lines and comment lines
                if not line or line.startswith('#'):
                    continue
                l = line.split(sep)

                if not len(l) == len(hdr):
                    raise RuntimeError(
                        "Number of entries in line [%i] does not match number "
                        "of columns in header [%i]." % (len(l), len(hdr)))

                for i, v in enumerate(l):
                    if dtype[i] is not None:
                        try:
                            v = dtype[i](v)
                        except ValueError:
                            # report the datatype of this very column
                            warning("Can't convert %r to desired datatype %r."
                                    % (v, dtype[i])
                                    + " Leaving original type")
                    tbl[i].append(v)
        finally:
            file_.close()

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError("Number of columns read from file does not "
                               "match the number of header entries.")

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]

    def __iadd__(self, other):
        """Merge column data.

        Every column of `other` is appended to the column of the same
        name.

        Raises
        ------
        ValueError
          If `other` has a column unknown to this instance, if a column
          of `other` is not a list, or if columns end up with different
          lengths.
        """
        # for all columns in the other object
        for k, v in other.items():
            if k not in self:
                raise ValueError('Unknown key [%r].' % (k, ))
            if not isinstance(v, list):
                raise ValueError('Can only merge list data, but got [%r].'
                                 % type(v))
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self

    ##REF: Name was automagically refactored
    def select_samples(self, selection):
        """Return new ColumnData with selected samples

        `selection` is iterated once per column and each of its items is
        used as an index into the column.
        """
        data = copy.deepcopy(self)
        # list() -- values are reassigned while looping
        for k, v in list(data.items()):
            data[k] = [v[x] for x in selection]

        data._check()
        return data

    @property
    def ncolumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())

    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        Parameters
        ----------
        filename : str
          Target filename
        header : bool, optional
          If `True` a column header is written, using the column
          keys. If `False` no header is written.
        header_order : None or list of str
          If it is a list of strings, they will be used instead of
          simply asking for the dictionary keys. However
          these strings must match the dictionary keys in number and
          identity. This argument type can be used to determine the
          order of the columns in the output file. The default value
          is `None`. In this case the columns follow the order of the
          header read from file (if any), with all other columns
          appended.
        sep : str, optional
          String that is written as a separator between to data columns.

        Raises
        ------
        ValueError
          If `header_order` does not match the columns.
        """
        # Figure out (and validate) the column order before touching the
        # file, so that an invalid request does not truncate the target.
        if header_order is None:
            if self._header_order is None:
                col_hdr = list(self.keys())
            else:
                # use stored order + newly added keys at the last columns
                col_hdr = self._header_order + \
                          [k for k in self.keys()
                           if k not in self._header_order]
        else:
            if not len(header_order) == self.ncolumns:
                raise ValueError('Header list does not match number of '
                                 'columns.')
            for k in header_order:
                if k not in self:
                    raise ValueError('Unknown key [%r]' % (k, ))
            col_hdr = header_order

        file_ = open(filename, 'w')
        try:
            # write header
            if header == True:
                file_.write(sep.join(col_hdr) + '\n')

            # for all rows
            for r in range(self.nrows):
                # get attributes for all keys
                l = [str(self[k][r]) for k in col_hdr]
                # write to file with proper separator
                file_.write(sep.join(l) + '\n')
        finally:
            file_.close()

    @property
    def nrows(self):
        """Returns the number of rows.
        """
        # any column is as good as any other
        for column in self.values():
            return len(column)
        # no data no rows (after Bob Marley)
        return 0
class ReferencesAtlas(PyMVPAAtlas):
    """
    Atlas which provides references to the other atlases.

    Example: the atlas which has references to the closest points
    (closest Gray, etc) in another atlas.
    """

    def __init__(self, distance=0, *args, **kwargs):
        """Initialize `ReferencesAtlas`
        """
        PyMVPAAtlas.__init__(self, *args, **kwargs)

        # sanity checks
        header_tags = XMLBasedAtlas._children_tags(self.header)
        if 'reference-atlas' not in header_tags:
            raise XMLAtlasException(
                "ReferencesAtlas must refer to a some other atlas")

        referenceAtlasName = self.header["reference-atlas"].text

        # uff -- another evil import but we better use the factory method
        from mvpa.atlases.warehouse import Atlas
        reference_filename = reuse_absolute_path(self._filename,
                                                 referenceAtlasName)
        self.__referenceAtlas = Atlas(filename=reference_filename)

        # both atlases have to agree on space and its flavor
        if self.__referenceAtlas.space != self.space or \
           self.__referenceAtlas.space_flavor != self.space_flavor:
            raise XMLAtlasException(
                "Reference and original atlases should be in the same space")

        self.__referenceLevel = None
        self.set_distance(distance)

    __doc__ = enhanced_doc_string('ReferencesAtlas', locals(), PyMVPAAtlas)

    # number of levels must be of the referenced atlas due to
    # handling of that in __getitem__
    #nlevels = property(fget=lambda self:self.__referenceAtlas.nlevels)
    ##REF: Name was automagically refactored
    def _get_nlevels_virtual(self):
        """Number of levels as reported by the referenced atlas.
        """
        return self.__referenceAtlas.nlevels

    ##REF: Name was automagically refactored
    def set_reference_level(self, level):
        """
        Set the level which will be queried
        """
        if level not in self._levels:
            raise IndexError(
                "Unknown reference level %r. " % level +
                "Known are %r" % (self._levels.keys(), ))
        self.__referenceLevel = self._levels[level]

    ##REF: Name was automagically refactored
    def label_voxel(self, c, levels=None):
        """Label a voxel via the referenced atlas.

        If the referenced point stored for `c` is closer than the
        configured distance (by at least 1e-3), the referenced atlas is
        queried at that point; otherwise it is queried at `c` itself.
        The result of the referenced atlas is extended with
        ``'voxel_referenced'`` and ``'distance'`` entries.
        """
        if self.__referenceLevel is None:
            warning("You did not provide what level to use "
                    "for reference. Assigning 0th level -- '%s'"
                    % (self._levels[0], ))
            self.set_reference_level(0)
            # return self.__referenceAtlas.label_voxel(c, levels)

        c = self._check_range(c)

        # obtain coordinates of the closest voxel
        ref_indexes = self.__referenceLevel.indexes
        cref = self._data[ref_indexes, c[2], c[1], c[0]]
        dist = norm((cref - c) * self.voxdim)
        if __debug__:
            debug('ATL__', "Closest referenced point for %r is "
                  "%r at distance %3.2f" % (c, cref, dist))

        # neglect everything smaller
        close_enough = (self.distance - dist) >= 1e-3
        if not close_enough:
            result = self.__referenceAtlas.label_voxel(c, levels)
            if __debug__:
                debug('ATL__', "Closest referenced point is "
                      "further than desired distance %.2f" % self.distance)
            result['voxel_referenced'] = None
            result['distance'] = 0
            return result

        result = self.__referenceAtlas.label_voxel(cref, levels)
        result['voxel_referenced'] = c
        result['distance'] = dist
        return result

    ##REF: Name was automagically refactored
    def levels_listing(self):
        """Listing of levels as provided by the referenced atlas.
        """
        return self.__referenceAtlas.levels_listing()

    ##REF: Name was automagically refactored
    def _get_levels_virtual(self):
        """Levels of the referenced atlas.
        """
        return self.__referenceAtlas.levels

    ##REF: Name was automagically refactored
    def set_distance(self, distance):
        """Set desired maximal distance for the reference
        """
        if distance < 0:
            raise ValueError("Distance should not be negative. "
                             " Thus '%f' is not a legal value" % distance)
        if __debug__:
            debug('ATL__',
                  "Setting maximal distance for queries to be %d" % distance)
        self.__distance = distance

    distance = property(fget=lambda self: self.__distance, fset=set_distance)
def __init__(cls, name, bases, dict):
    """Gather collectable attributes of a new class into collections.

    Collectables found among the class attributes are removed from the
    class and grouped by collection; collections of bases using this
    metaclass are merged in; the result is stored as
    `_collections_template`.  Documentation helpers `_paramscols`,
    `_cadoc` and `_paramsdoc` are assigned as well.

    Parameters
    ----------
    name : str
      Name of the class
    bases : iterable
      Base classes
    dict : dict
      Attributes.
    """
    if __debug__:
        debug(
            "COLR",
            "AttributesCollector call for %s.%s, where bases=%s, dict=%s " \
            % (cls, name, bases, dict))

    super(AttributesCollector, cls).__init__(name, bases, dict)

    collections = {}
    for attr_name, value in dict.iteritems():
        if not isinstance(value, IndexedCollectable):
            continue
        baseclassname = value.__class__.__name__
        col = _known_collections[baseclassname][0]
        # XXX should we allow to throw exceptions here?
        collections.setdefault(col, {})[attr_name] = value
        # and assign name if not yet was set
        if value.name is None:
            value.name = attr_name
        # !!! We do not keep copy of this attribute static in the class.
        #     Due to below traversal of base classes, we should be
        #     able to construct proper collections even in derived classes
        delattr(cls, attr_name)

    # XXX can we first collect parent's ca and then populate with ours?
    # TODO
    for base in bases:
        if not (hasattr(base, "__metaclass__")
                and base.__metaclass__ == AttributesCollector):
            continue
        # TODO take care about overriding one from super class
        # for state in base.ca:
        #    if state[0] =
        inherited = base._collections_template
        if len(inherited) == 0:
            continue
        if __debug__:  # XXX RF:  and "COLR" in debug.active:
            debug("COLR",
                  "Collect collections %s for %s from %s"
                  % (inherited, cls, base))
        for col, collection in inherited.iteritems():
            if col in collections:
                collections[col].update(collection)
            else:
                collections[col] = collection

    if __debug__:
        debug("COLR",
              "Creating ConditionalAttributesCollection template %s with collections %s"
              % (cls, collections.keys()))

    # if there is an explicit list of collections requested by the class
    if hasattr(cls, "_ATTRIBUTE_COLLECTIONS"):
        for col in cls._ATTRIBUTE_COLLECTIONS:
            if col not in _col2class:
                raise ValueError(
                    "Requested collection %s is unknown to collector" % col)
            if col not in collections:
                collections[col] = None

    # TODO: check on conflict in names of Collections' items!  since
    # otherwise even order is not definite since we use dict for
    # collections.
    # XXX should we switch to tuple?
    for col, colitems in collections.iteritems():
        # so far we collected the collection items in a dict, but the new
        # API requires to pass a _list_ of collectables instead of a dict.
        # So, whenever there are items, we pass just the values of the dict.
        # No information is lost, since the keys of the dict are the
        # name attributes of each collectable in the list.
        if colitems is None:
            collections[col] = _col2class[col]()
        else:
            collections[col] = _col2class[col](items=colitems.values())

    setattr(cls, "_collections_template", collections)

    #
    # Expand documentation for the class based on the listed
    # parameters and if it is stateful
    #
    # TODO -- figure nice way on how to alter __init__ doc directly...
    textwrapper = TextWrapper(subsequent_indent=" ",
                              initial_indent=" ",
                              width=70)

    # Parameters
    paramsdoc = []
    paramscols = []
    for col in ('params', 'kernel_params'):
        if col not in collections:
            continue
        paramscols.append(col)
        # lets at least sort the parameters for consistent output
        col_items = collections[col]
        ordered = sorted((v._instance_index, k)
                         for k, v in col_items.iteritems())
        paramsdoc += [(col_items[key].name, col_items[key]._paramdoc())
                      for _, key in ordered]

    # Parameters collection could be taked hash of to decide if
    # any were changed? XXX may be not needed at all?
    setattr(cls, "_paramscols", paramscols)

    # States doc
    cadoc = ""
    if 'ca' in collections:
        paramsdoc += [
            ('enable_ca',
             "enable_ca : None or list of str\n "
             "Names of the conditional attributes which should "
             "be enabled in addition\n to the default ones"),
            ('disable_ca',
             "disable_ca : None or list of str\n "
             "Names of the conditional attributes which should "
             "be disabled")]
        if len(collections['ca']):
            cadoc += '\n'.join(['* ' + x
                                for x in collections['ca'].listing])
            cadoc += "\n\n(Conditional attributes enabled by default suffixed with `+`)"
        if __debug__:
            debug("COLR", "Assigning __cadoc to be %s" % cadoc)
        setattr(cls, "_cadoc", cadoc)

    # NOTE(review): paramsdoc is a list, so this comparison with a string
    # always holds and `_paramsdoc` is assigned unconditionally
    if paramsdoc != "":
        if __debug__ and 'COLR' in debug.active:
            debug("COLR", "Assigning __paramsdoc to be %s" % paramsdoc)
        setattr(cls, "_paramsdoc", paramsdoc)

    if len(paramsdoc) or cadoc != "":
        cls.__doc__ = enhanced_doc_string(cls, *bases)
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters.

    The final splitting is done running the splitter object on a certain
    Dataset via __call__(). This method has to be implemented like a
    generator, i.e. it has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the splitted dataset may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """

    _STRATEGIES = ('first', 'random', 'equidistant')
    _NPERLABEL_STR = ['equal', 'all']

    def __init__(self, npertarget='all', nrunspersplit=1, permute_attr=None,
                 count=None, strategy='equidistant', discard_boundary=None,
                 attr='chunks', reverse=False, noslicing=False):
        """Initialize splitter base.

        Parameters
        ----------
        npertarget : int or str (or list of them) or float
          Number of dataset samples per label to be included in each
          split. If given as a float, it must be in [0,1] range and would
          mean the ratio of selected samples per each label.
          Two special strings are recognized: 'all' uses all available
          samples (default) and 'equal' uses the maximum number of samples
          the can be provided by all of the classes. This value might be
          provided as a sequence length of which matches the number of
          datasets per split and indicates the configuration for the
          respective dataset in each split.
        nrunspersplit : int
          Number of times samples for each split are chosen. This
          is mostly useful if a subset of the available samples
          is used in each split and the subset is randomly
          selected for each run (see the `npertarget` argument).
        permute_attr : None or str
          If set to a string (e.g. 'targets'), the corresponding .sa
          of each generated dataset will be permuted on a per-chunk
          basis.
        count : None or int
          Desired number of splits to be output. It is limited by the
          number of splits possible for a given splitter
          (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
          all splits are output (default).
        strategy : str
          If `count` is not None, the following strategies are possible:
          'first': First `count` splits are chosen;
          'random': Random (without replacement) `count` splits are chosen;
          'equidistant': Splits which are equidistant from each other.
        discard_boundary : None or int or sequence of int
          If not `None`, how many samples on the boundaries between
          parts of the split to discard in the training part.
          If int, then discarded in all parts.  If a sequence, numbers
          to discard are given per part of the split.
          E.g. if splitter splits only into (training, testing)
          parts, then `discard_boundary=(2,0)` would instruct to discard
          2 samples from training which are on the boundary with testing.
        attr : str
          Sample attribute used to determine splits.
        reverse : bool
          If True, the order of datasets in the split is reversed, e.g.
          instead of (training, testing), (testing, training) will be spit
          out
        noslicing : bool
          If True, dataset splitting is not done by slicing (causing
          shared data between source and split datasets) even if it would
          be possible. By default slicing is performed whenever possible
          to reduce the memory footprint.

        Raises
        ------
        ValueError
          If `strategy` is unknown or `npertarget` is an unsupported string.
        """
        # pylint happyness block
        self.__npertarget = None
        self.__runspersplit = nrunspersplit
        self.__permute_attr = permute_attr
        self.__splitattr = attr
        self.__noslicing = noslicing
        self._reverse = reverse
        self.discard_boundary = discard_boundary

        # we don't check it, thus no reason to make it private.
        # someone might find it useful to change post creation
        # TODO utilize such (or similar) policy through out the code
        self.count = count
        """Number (max) of splits to output on call"""

        self._set_strategy(strategy)

        # pattern sampling status vars
        self.set_n_per_label(npertarget)

    __doc__ = enhanced_doc_string('Splitter', locals())

    ##REF: Name was automagically refactored
    def _set_strategy(self, strategy):
        """Set strategy to select splits out from available

        The name is lower-cased before being checked against `_STRATEGIES`.

        Raises
        ------
        ValueError
          If the strategy is not one of `_STRATEGIES`.
        """
        strategy = strategy.lower()
        if strategy not in self._STRATEGIES:
            raise ValueError("strategy is not known. Known are %s"
                             % str(self._STRATEGIES))
        self.__strategy = strategy

    ##REF: Name was automagically refactored
    def set_n_per_label(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).

        Raises
        ------
        ValueError
          If `value` is a string other than those in `_NPERLABEL_STR`.
        """
        if isinstance(value, basestring):
            if value not in self._NPERLABEL_STR:
                raise ValueError("Unsupported value '%s' for npertarget."
                                 " Supported ones are %s or float or int"
                                 % (value, self._NPERLABEL_STR))
        self.__npertarget = value

    ##REF: Name was automagically refactored
    def _get_split_config(self, uniqueattr):
        """Return list with samples of 2nd dataset in a split.

        Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of
        lists containing sample ids to split into the second dataset.
        """
        raise NotImplementedError

    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.  For every split
        configuration and every run (`nrunspersplit`) it yields a list of
        datasets (or `None` entries), in reversed order if `reverse` was
        requested.
        """
        # for each split
        cfgs = self.splitcfg(dataset)
        n_cfgs = len(cfgs)

        # Finally split the data
        for isplit, split in enumerate(cfgs):

            # determine sample sizes: a scalar or string setting applies to
            # every dataset of the split
            if not operator.isSequenceType(self.__npertarget) \
               or isinstance(self.__npertarget, str):
                npertargetsplit = [self.__npertarget] * len(split)
            else:
                npertargetsplit = self.__npertarget

            # get splitted datasets
            split_ds = self.split_dataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, npertarget in zip(split_ds, npertargetsplit):
                    if ds is not None:
                        # Set flag of dataset either this was the last split
                        # ??? per our discussion this might be the best
                        #     solution which would scale if we care about
                        #     thread-safety etc
                        ds_a = ds.a
                        lastsplit = (isplit == n_cfgs - 1)
                        if not ds_a.has_key('lastsplit'):
                            # if not yet known -- add one
                            ds_a['lastsplit'] = lastsplit
                        else:
                            # otherwise just assign a new value
                            ds_a.lastsplit = lastsplit

                        # permute the labels; there is nothing to permute
                        # for an empty (None) part of the split
                        if self.__permute_attr is not None:
                            permute_attr(ds, attr=self.__permute_attr,
                                         chunks_attr='chunks', col='sa')

                    # select subset of samples if requested
                    if npertarget == 'all' or ds is None:
                        finalized_datasets.append(ds)
                        continue

                    # We need to select a subset of samples
                    # TODO: move all this logic within random_sample

                    # go for maximum possible number of samples provided
                    # by each label in this dataset
                    if npertarget == 'equal':
                        # determine the min number of samples per class
                        npl = np.array(
                            get_nsamples_per_attr(
                                ds, 'targets').values()).min()
                    elif isinstance(npertarget, float) or (
                            operator.isSequenceType(npertarget)
                            and len(npertarget) > 0
                            and isinstance(npertarget[0], float)):
                        # determine number of samples per class and take
                        # a ratio
                        counts = np.array(
                            get_nsamples_per_attr(ds, 'targets').values())
                        npl = (counts * npertarget).round().astype(int)
                    else:
                        npl = npertarget

                    # finally select the patterns
                    finalized_datasets.append(random_samples(ds, npl))

                if self._reverse:
                    yield finalized_datasets[::-1]
                else:
                    yield finalized_datasets

    ##REF: Name was automagically refactored
    def split_dataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        Parameters
        ----------
        dataset : Dataset
          This is this source dataset.
        specs : sequence of sequences
          Contains ids of a sample attribute that shall be split into the
          another dataset.  At most one element may be `None`; it stands
          for all samples which are not selected by any other element.

        Returns
        -------
        List of splitted datasets; an entry is `None` if no samples are
        left for it.

        Raises
        ------
        ValueError
          If more than one element of `specs` is `None`.
        """
        # collect the sample ids for each resulting dataset
        filters = []
        none_specs = 0
        cum_filter = None

        # Prepare discard_boundary: a single int applies to every part
        discard_boundary = self.discard_boundary
        if isinstance(discard_boundary, int):
            if discard_boundary != 0:
                discard_boundary = (discard_boundary, ) * len(specs)
            else:
                discard_boundary = None

        splitattr_data = dataset.sa[self.__splitattr].value
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
                continue
            filter_ = np.array([i in spec for i in splitattr_data],
                               dtype='bool')
            filters.append(filter_)
            if cum_filter is None:
                cum_filter = filter_
            else:
                # accumulate the UNION of all explicitly selected samples,
                # so the `None` part becomes the complement of every other
                # part (an intersection would leak their samples into it)
                cum_filter = np.logical_or(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError("Splitter cannot handle more than one `None` "
                             "split definition.")

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = np.logical_not(cum_filter)

            # If it was told to discard samples on the boundary to the
            # other parts of the split
            if discard_boundary is not None:
                ndiscard = discard_boundary[i]
                if ndiscard != 0:
                    # XXX sloppy implementation for now. It still
                    # should not be the main reason for a slow-down of
                    # the whole analysis ;)
                    # A sample survives only if all neighbors within
                    # `ndiscard` positions are selected too; the padding
                    # keeps the dataset edges from counting as boundaries.
                    f, lenf = filters[i], len(filters[i])
                    f_pad = np.concatenate(
                        ([True] * ndiscard, f, [True] * ndiscard))
                    for d in xrange(2 * ndiscard + 1):
                        f = np.logical_and(f, f_pad[d:d + lenf])
                    filters[i] = f[:]

        # split data: return None if no samples are left
        # XXX: Maybe it should simply return an empty dataset instead, but
        #      keeping it this way for now, to maintain current behavior
        split_datasets = []
        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                # check whether we can do slicing instead of advanced
                # indexing -- if we can split the dataset without causing
                # the data to be copied, its is quicker and leaner.
                # However, it only works if we have a contiguous chunk or
                # regular step sizes for the samples to be split
                split_datasets.append(dataset[self._filter2slice(filter_)])

        return split_datasets

    def _filter2slice(self, bf):
        """Convert a boolean filter into a slice whenever possible.

        Returns `bf` itself if slicing is disabled (`noslicing`) or if the
        selected indices are not regularly spaced; otherwise a `slice`
        selecting the same elements.

        Raises
        ------
        ValueError
          If `bf` is empty.
        """
        if self.__noslicing:
            # we are not allowed to help :-(
            return bf
        # the filter should be a boolean array
        if not len(bf):
            raise ValueError("'%s' recieved an empty filter. This is a "
                             "bug." % self.__class__.__name__)
        # get indices of non-zero filter elements
        idx = bf.nonzero()[0]
        idx_start = idx[0]
        idx_end = idx[-1] + 1
        idx_step = None
        if len(idx) > 1:
            # we need to figure out if there is a regular step-size
            # between elements
            stepsizes = np.unique(idx[1:] - idx[:-1])
            if len(stepsizes) > 1:
                # multiple step-sizes -> slicing is not possible -> return
                # orginal filter
                return bf
            idx_step = stepsizes[0]

        sl = slice(idx_start, idx_end, idx_step)
        if __debug__:
            debug("SPL",
                  "Splitting by basic slicing is possible and permitted "
                  "(%s)." % sl)
        return sl

    def __str__(self):
        """String summary over the object
        """
        return \
            "SplitterConfig: npertarget:%s runs-per-split:%d permute_attr:%s" \
            % (self.__npertarget, self.__runspersplit, self.__permute_attr)

    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset

        All configurations obtained from `_get_split_config` are returned,
        unless `count` is set and smaller than their number; then `count`
        of them are selected according to `strategy`.
        """
        cfgs = self._get_split_config(dataset.sa[self.__splitattr].unique)

        # Select just some splits if desired
        count, n_cfgs = self.count, len(cfgs)

        # further makes sense only iff count < n_cfgs,
        # otherwise all strategies are equivalent
        if count is not None and count < n_cfgs:
            if count < 1:
                # we can only wish a good luck
                return []
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out what step is needed to
                    # accommodate the `count` number
                    step = float(n_cfgs) / count
                    assert (step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
                elif strategy == 'random':
                    indexes = np.random.permutation(range(n_cfgs))[:count]
                    # doesn't matter much but lets keep them in the original
                    # order at least
                    indexes.sort()
                else:
                    # who said that I am paranoid?
                    raise RuntimeError("Really should not happen")
                if __debug__:
                    debug("SPL",
                          "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, n_cfgs))
                cfgs = [cfgs[i] for i in indexes]

        return cfgs

    strategy = property(fget=lambda self: self.__strategy,
                        fset=_set_strategy)
    splitattr = property(fget=lambda self: self.__splitattr)
    permute_attr = property(fget=lambda self: self.__permute_attr)
    npertarget = property(fget=lambda self: self.__npertarget)
def __init__(cls, name, bases, dict):
    """Gather collectable attributes of a new class into collections.

    Collectables found among the class attributes are removed from the
    class and grouped by collection; collections of bases using this
    metaclass are merged in; the result is stored as
    `_collections_template`.  Documentation helpers `_paramscols`,
    `_cadoc` and `_paramsdoc` are assigned as well.

    Parameters
    ----------
    name : str
      Name of the class
    bases : iterable
      Base classes
    dict : dict
      Attributes.
    """
    if __debug__:
        debug(
            "COLR",
            "AttributesCollector call for %s.%s, where bases=%s, dict=%s " \
            % (cls, name, bases, dict))

    super(AttributesCollector, cls).__init__(name, bases, dict)

    collections = {}
    for attr_name, value in dict.iteritems():
        if not isinstance(value, IndexedCollectable):
            continue
        baseclassname = value.__class__.__name__
        col = _known_collections[baseclassname][0]
        # XXX should we allow to throw exceptions here?
        collections.setdefault(col, {})[attr_name] = value
        # and assign name if not yet was set
        if value.name is None:
            value.name = attr_name
        # !!! We do not keep copy of this attribute static in the class.
        #     Due to below traversal of base classes, we should be
        #     able to construct proper collections even in derived classes
        delattr(cls, attr_name)

    # XXX can we first collect parent's ca and then populate with ours?
    # TODO
    for base in bases:
        if not (hasattr(base, "__metaclass__")
                and base.__metaclass__ == AttributesCollector):
            continue
        # TODO take care about overriding one from super class
        # for state in base.ca:
        #    if state[0] =
        inherited = base._collections_template
        if len(inherited) == 0:
            continue
        if __debug__:  # XXX RF:  and "COLR" in debug.active:
            debug("COLR",
                  "Collect collections %s for %s from %s"
                  % (inherited, cls, base))
        for col, collection in inherited.iteritems():
            if col in collections:
                collections[col].update(collection)
            else:
                collections[col] = collection

    if __debug__:
        debug("COLR",
              "Creating ConditionalAttributesCollection template %s with collections %s"
              % (cls, collections.keys()))

    # if there is an explicit list of collections requested by the class
    if hasattr(cls, "_ATTRIBUTE_COLLECTIONS"):
        for col in cls._ATTRIBUTE_COLLECTIONS:
            if col not in _col2class:
                raise ValueError(
                    "Requested collection %s is unknown to collector" % col)
            if col not in collections:
                collections[col] = None

    # TODO: check on conflict in names of Collections' items!  since
    # otherwise even order is not definite since we use dict for
    # collections.
    # XXX should we switch to tuple?
    for col, colitems in collections.iteritems():
        # so far we collected the collection items in a dict, but the new
        # API requires to pass a _list_ of collectables instead of a dict.
        # So, whenever there are items, we pass just the values of the dict.
        # No information is lost, since the keys of the dict are the
        # name attributes of each collectable in the list.
        if colitems is None:
            collections[col] = _col2class[col]()
        else:
            collections[col] = _col2class[col](items=colitems.values())

    setattr(cls, "_collections_template", collections)

    #
    # Expand documentation for the class based on the listed
    # parameters and if it is stateful
    #
    # TODO -- figure nice way on how to alter __init__ doc directly...
    textwrapper = TextWrapper(subsequent_indent=" ",
                              initial_indent=" ",
                              width=70)

    # Parameters
    paramsdoc = []
    paramscols = []
    for col in ('params', 'kernel_params'):
        if col not in collections:
            continue
        paramscols.append(col)
        # lets at least sort the parameters for consistent output
        col_items = collections[col]
        ordered = sorted((v._instance_index, k)
                         for k, v in col_items.iteritems())
        paramsdoc += [(col_items[key].name, col_items[key]._paramdoc())
                      for _, key in ordered]

    # Parameters collection could be taked hash of to decide if
    # any were changed? XXX may be not needed at all?
    setattr(cls, "_paramscols", paramscols)

    # States doc
    cadoc = ""
    if 'ca' in collections:
        paramsdoc += [
            ('enable_ca',
             "enable_ca : None or list of str\n "
             "Names of the conditional attributes which should "
             "be enabled in addition\n to the default ones"),
            ('disable_ca',
             "disable_ca : None or list of str\n "
             "Names of the conditional attributes which should "
             "be disabled")]
        if len(collections['ca']):
            cadoc += '\n'.join(['* ' + x
                                for x in collections['ca'].listing])
            cadoc += "\n\n(Conditional attributes enabled by default suffixed with `+`)"
        if __debug__:
            debug("COLR", "Assigning __cadoc to be %s" % cadoc)
        setattr(cls, "_cadoc", cadoc)

    # NOTE(review): paramsdoc is a list, so this comparison with a string
    # always holds and `_paramsdoc` is assigned unconditionally
    if paramsdoc != "":
        if __debug__ and 'COLR' in debug.active:
            debug("COLR", "Assigning __paramsdoc to be %s" % paramsdoc)
        setattr(cls, "_paramsdoc", paramsdoc)

    if len(paramsdoc) or cadoc != "":
        cls.__doc__ = enhanced_doc_string(cls, *bases)
class DatasetMeasure(ClassWithCollections):
    """A measure computed from a `Dataset`

    All dataset measures support arbitrary transformation of the measure
    after it has been computed. Transformation are done by processing the
    measure with a functor that is specified via the `transformer` keyword
    argument of the constructor. Upon request, the raw measure (before
    transformations are applied) is stored in the `raw_results` conditional
    attribute.

    Additionally all dataset measures support the estimation of the
    probabilit(y,ies) of a measure under some distribution. Typically this
    will be the NULL distribution (no signal), that can be estimated with
    permutation tests. If a distribution estimator instance is passed to the
    `null_dist` keyword argument of the constructor the respective
    probabilities are automatically computed and stored in the `null_prob`
    conditional attribute.

    Notes
    -----
    For developers: All subclasses shall get all necessary parameters via
    their constructor, so it is possible to get the same type of measure for
    multiple datasets by passing them to the __call__() method successively.
    """

    raw_results = ConditionalAttribute(enabled=False,
        doc="Computed results before applying any " +
            "transformation algorithm")
    null_prob = ConditionalAttribute(enabled=True)
    """Stores the probability of a measure under the NULL hypothesis"""
    null_t = ConditionalAttribute(enabled=False)
    """Stores the t-score corresponding to null_prob under assumption
    of Normal distribution"""

    def __init__(self, postproc=None, null_dist=None, **kwargs):
        """Store the post-processing mapper and the NULL distribution.

        Parameters
        ----------
        postproc : Mapper instance
          Mapper to perform post-processing of results. This mapper is
          applied in `__call__()` to perform a final processing step on the
          to be returned dataset measure. If None, nothing is done.
        null_dist : instance of distribution estimator
          The estimated distribution is used to assign a probability for a
          certain value of the computed measure.  The given value is passed
          through `auto_null_dist` before being stored.
        **kwargs
          Passed on to `ClassWithCollections.__init__`.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self.__postproc = postproc
        """Functor to be called in return statement of all subclass __call__()
        methods."""

        null_dist_ = auto_null_dist(null_dist)
        if __debug__:
            debug('SA', 'Assigning null_dist %s whenever original given was %s'
                  % (null_dist_, null_dist))
        self.__null_dist = null_dist_

    __doc__ = enhanced_doc_string('DatasetMeasure', locals(),
                                  ClassWithCollections)

    def __call__(self, dataset):
        """Compute measure on a given `Dataset`.

        Each implementation has to handle a single arguments: the source
        dataset.

        Returns the computed measure in some iterable (list-like)
        container applying a post-processing mapper if such is defined.
        """
        result = self._call(dataset)
        result = self._postcall(dataset, result)

        # XXX Remove when "sensitivity-return-dataset" transition is done
        if __debug__ \
           and not isinstance(result, AttrDataset) \
           and not len(result.shape) == 1:
            warning("Postprocessing of '%s' doesn't return a Dataset, or "
                    "1D-array (got: '%s')."
                    % (self.__class__.__name__, result))
        return result

    def _call(self, dataset):
        """Actually compute measure on a given `Dataset`.

        Each implementation has to handle a single arguments: the source
        dataset.

        Returns the computed measure in some iterable (list-like) container.

        Raises
        ------
        NotImplementedError
          Always, in this base class; subclasses have to override.
        """
        # NotImplemented is a constant for rich comparisons, not an
        # exception -- the exception class is NotImplementedError
        raise NotImplementedError

    def _postcall(self, dataset, result):
        """Some postprocessing on the result

        Stores the raw result in `ca.raw_results`, applies the
        post-processing mapper (if any) and, if a NULL distribution
        estimator was given, fits it and stores `ca.null_prob` (and
        `ca.null_t` if that conditional attribute is enabled).

        Returns the (possibly post-processed) result.

        Raises
        ------
        RuntimeError
          If `null_t` is enabled and the tail setting of the NULL
          distribution is not one of 'left', 'right', 'any', 'both'.
        """
        self.ca.raw_results = result

        # post-processing
        if not self.__postproc is None:
            if __debug__:
                debug("SA_", "Applying mapper %s" % self.__postproc)
            result = self.__postproc.forward(result)

        # estimate the NULL distribution when functor is given
        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching datameasure instance, but we have to disable
            # the estimation of the null distribution in that child to prevent
            # infinite looping.
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, dataset)

            if self.ca.is_enabled('null_t'):
                # get probability under NULL hyp, but also request
                # either it belong to the right tail
                null_prob, null_right_tail = \
                    self.__null_dist.p(result, return_tails=True)
                self.ca.null_prob = null_prob

                externals.exists('scipy', raise_=True)
                from scipy.stats import norm

                # TODO: following logic should appear in NullDist,
                #       not here
                tail = self.null_dist.tail
                if tail == 'left':
                    acdf = np.abs(null_prob)
                elif tail == 'right':
                    acdf = 1.0 - np.abs(null_prob)
                elif tail in ['any', 'both']:
                    acdf = 1.0 - np.clip(np.abs(null_prob), 0, 0.5)
                else:
                    raise RuntimeError('Unhandled tail %s' % tail)
                # We need to clip to avoid non-informative inf's ;-)
                # that happens due to limited floating point precision.
                # We could clip values around 0 at as low as 1e-100
                # (correspond to z~=21), but for consistency lets clip
                # at 1e-16 which leads to distinguishable value around
                # p=1 and max z=8.2.
                # Should be sufficient range of z-values ;-)
                clip = 1e-16
                null_t = norm.ppf(np.clip(acdf, clip, 1.0 - clip))
                # assure that we deal with arrays:
                null_t = np.array(null_t, ndmin=1, copy=False)
                null_t[~null_right_tail] *= -1.0  # revert sign for negatives
                self.ca.null_t = null_t           # store
            else:
                # get probability of result under NULL hypothesis if available
                # and don't request tail information
                self.ca.null_prob = self.__null_dist.p(result)

        return result

    def __repr__(self, prefixes=[]):
        """String representation of a `DatasetMeasure`

        Includes only arguments which differ from default ones
        """
        # work on a copy, so neither the caller's list nor the shared
        # default argument is modified
        prefixes = prefixes[:]
        if self.__postproc is not None:
            prefixes.append("postproc=%s" % self.__postproc)
        if self.__null_dist is not None:
            prefixes.append("null_dist=%s" % self.__null_dist)
        return super(DatasetMeasure, self).__repr__(prefixes=prefixes)

    def untrain(self):
        """'Untraining' Measure

        Some derived classes might used classifiers, so we need to
        untrain those.  Nothing is done in this base class.
        """
        pass

    @property
    def null_dist(self):
        """Return Null Distribution estimator"""
        return self.__null_dist

    @property
    def postproc(self):
        """Return mapper"""
        return self.__postproc