Example #1
    def forward(self, data):
        """Map data from input to output space.

        Parameters
        ----------
        data : Dataset-like, (at least 2D)-array-like
          Typically this is a `Dataset`, but it might also be a plain data
          array, or even something completely different(TM) that is supported
          by a subclass' implementation. If such an object is Dataset-like it
          is handled by a dedicated method that also transforms dataset
          attributes if necessary. If an array-like is passed, it has to be
          at least two-dimensional, with the first axis separating samples
          or observations. For single samples `forward1()` might be more
          appropriate.
        """
        if is_datasetlike(data):
            return self._forward_dataset(data)
        else:
            if __debug__:
                if hasattr(data, 'ndim') and data.ndim < 2:
                    raise ValueError(
                        'Mapper.forward() only supports mapping of data with '
                        'at least two dimensions, where the first axis '
                        'separates samples/observations. Consider using '
                        'Mapper.forward1() instead.')
            return self._forward_data(data)
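A minimal usage sketch for the dispatch above, assuming a trained mapper instance named `mapper`; the shapes are illustrative only:

import numpy as np

samples = np.random.rand(10, 5)        # 10 samples, 5 features
mapped = mapper.forward(samples)       # 2D array: first axis separates samples
single = mapper.forward1(samples[0])   # a single 1D sample goes through forward1()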
Example #2
    def forward(self, data):
        """Map data from input to output space.

        Parameters
        ----------
        data : Dataset-like, (at least 2D)-array-like
          Typically this is a `Dataset`, but it might also be a plain data
          array, or even something completely different(TM) that is supported
          by a subclass' implementation. If such an object is Dataset-like it
          is handled by a dedicated method that also transforms dataset
          attributes if necessary. If an array-like is passed, it has to be
          at least two-dimensional, with the first axis separating samples
          or observations. For single samples `forward1()` might be more
          appropriate.
        """
        if is_datasetlike(data):
            if __debug__:
                debug('MAP', "Forward-map %s-shaped dataset through '%s'."
                        % (data.shape, self))
            return self._forward_dataset(data)
        else:
            if hasattr(data, 'ndim') and data.ndim < 2:
                raise ValueError(
                    'Mapper.forward() only supports mapping of data with '
                    'at least two dimensions, where the first axis '
                    'separates samples/observations. Consider using '
                    'Mapper.forward1() instead.')
            if __debug__:
                debug('MAP', "Forward-map data through '%s'." % (self))
            return self._forward_data(data)
Example #3
    def compute(self, ds1, ds2=None):
        """Generic computation of any kernel

        Assumptions:

         - ds1, ds2 are either datasets or arrays,
         - presumably 2D (neither checked nor enforced here)
         - _compute takes ndarrays. If your kernel needs datasets,
           override compute
        """
        if is_datasetlike(ds1):
            ds1 = ds1.samples
        if ds2 is None:
            ds2 = ds1
        elif is_datasetlike(ds2):
            ds2 = ds2.samples
        # TODO: assure 2D shape
        self._compute(ds1, ds2)
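A sketch of how a subclass might hook into this generic compute(), as the docstring suggests: only the ndarray-level _compute() is overridden. The base-class name `Kernel` and the `_k` attribute are assumptions for illustration.

import numpy as np

class LinearKernelSketch(Kernel):      # `Kernel` assumed to be the base class above
    def _compute(self, d1, d2):
        # d1: (n1, nf), d2: (n2, nf) ndarrays -> store the (n1, n2) Gram matrix
        self._k = np.dot(d1, d2.T)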
Example #4
def hstack(datasets):
    """Stacks datasets horizontally (appending features).

    Sample attribute collections are merged incrementally, attributes with
    identical keys overwriting previous ones in the stacked dataset. All
    datasets must have an identical set of feature attributes (matching keys,
    not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into the
    stacked dataset.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #

    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    for ds in datasets:
        merged.sa.update(ds.sa)

    return merged
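A usage sketch, assuming the `Dataset` constructor shown in the last example; the import path is an assumption:

import numpy as np
from mvpa2.datasets import Dataset, hstack   # import path is an assumption

ds1 = Dataset(np.zeros((4, 2)), sa={'targets': range(4)})
ds2 = Dataset(np.ones((4, 3)), sa={'targets': range(4)})
merged = hstack((ds1, ds2))
assert merged.samples.shape == (4, 5)        # features appended, samples unchanged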
Example #5
    def train(self, ds):
        """
        The default implementation calls ``_pretrain()``, ``_train()``, and
        finally ``_posttrain()``.

        Parameters
        ----------
        ds : Dataset
          Training dataset.

        Returns
        -------
        None
        """
        got_ds = is_datasetlike(ds)

        # TODO remove first condition if all Learners get only datasets
        if got_ds and (ds.nfeatures == 0 or len(ds) == 0):
            raise DegenerateInputError("Cannot train classifier on degenerate data %s" % ds)
        if __debug__:
            debug("LRN", "Training learner %(lrn)s on dataset %(dataset)s", msgargs={"lrn": self, "dataset": ds})

        self._pretrain(ds)

        # remember the time when started training
        t0 = time.time()

        if got_ds:
            # things might have happened during pretraining
            if ds.nfeatures > 0:
                result = self._train(ds)
            else:
                warning("Trying to train on dataset with no features present")
                if __debug__:
                    debug("LRN", "No features present for training, no actual training " "is called")
                result = None
        else:
            # in this case we claim to have no idea and simply try to train
            result = self._train(ds)

        # store timing
        self.ca.training_time = time.time() - t0

        # and post-proc
        result = self._posttrain(ds)

        # finally flag as trained
        self._set_trained()
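The control flow above is a template-method pattern: train() handles validation, timing, and the _pretrain/_train/_posttrain hooks. A standalone sketch of just that pattern follows; it is not the library's Learner class and only expects an object with a `samples` array:

import time

class TinyLearner(object):
    """Toy illustration of the pre-/train/post hook split."""
    def _pretrain(self, ds):
        pass
    def _train(self, ds):
        # toy "model": remember the per-feature mean of the training samples
        self.mean_ = ds.samples.mean(axis=0)
    def _posttrain(self, ds):
        pass
    def train(self, ds):
        self._pretrain(ds)
        t0 = time.time()
        self._train(ds)
        self.training_time = time.time() - t0
        self._posttrain(ds)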
Example #6
    def reverse(self, data):
        """Reverse-map data from output back into input space.

        Parameters
        ----------
        data : Dataset-like, anything
          Typically this is a `Dataset`, but it might also be a plain data
          array, or even something completely different(TM) that is supported
          by a subclass' implementation. If such an object is Dataset-like it
          is handled by a dedicated method that also transforms dataset
          attributes if necessary.
        """
        if is_datasetlike(data):
            return self._reverse_dataset(data)
        else:
            return self._reverse_data(data)
Example #7
def vstack(datasets):
    """Stacks datasets vertically (appending samples).

    Feature attribute collections are merged incrementally, attributes with
    identical keys overwriting previous ones in the stacked dataset. All
    datasets must have an identical set of sample attributes (matching keys,
    not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into the
    stacked dataset. If all input datasets have common dataset attributes that
    are also valid for the stacked dataset, they can be moved into the output
    dataset like this::

      ds_merged = vstack((ds1, ds2, ds3))
      ds_merged.a.update(ds1.a)

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    for ds in datasets:
        merged.fa.update(ds.fa)

    return merged
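Illustrative usage, assuming three compatible datasets ds1, ds2, ds3; dataset attributes are carried over explicitly, exactly as the docstring describes:

ds_merged = vstack((ds1, ds2, ds3))
assert len(ds_merged) == len(ds1) + len(ds2) + len(ds3)   # samples appended
ds_merged.a.update(ds1.a)          # optionally copy shared dataset attributes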
Example #8
    def p(self, x, return_tails=False, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined left, right, or from any tail
        depending on the constructor setting.

        In case a `FeaturewiseMeasure` was used to estimate the
        distribution, the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        peas = _pvalue(x, self.cdf, self.__tail, return_tails=return_tails,
                       **kwargs)
        if is_datasetlike(x):
            # return the p-values in a dataset as well and assign the input
            # dataset attributes to the return dataset too
            pds = x.copy(deep=False)
            if return_tails:
                pds.samples = peas[0]
                return pds, peas[1]
            else:
                pds.samples = peas
                return pds
        return peas
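A standalone illustration of the tail handling (not the library's `_pvalue` helper): given a CDF, the left tail is cdf(x), the right tail is 1 - cdf(x), and one common two-sided convention doubles the smaller of the two.

from scipy import stats

cdf = stats.norm(0, 1).cdf
x = 1.8
p_left = cdf(x)                          # probability of a value <= x
p_right = 1.0 - cdf(x)                   # probability of a value >= x
p_two_sided = 2 * min(p_left, p_right)   # a common two-sided convention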
Example #9
 def wrap_samples(obj, data, *args, **kwargs):
     if is_datasetlike(data):
         return fx(obj, data, *args, **kwargs)
     else:
         return fx(obj, Dataset(data), *args, **kwargs)
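This looks like the body of a decorator that guarantees the wrapped method always receives a Dataset; a sketch of such an enclosing decorator follows. The decorator name is an assumption for illustration.

from functools import wraps

def accepts_samples(fx):               # hypothetical decorator name
    @wraps(fx)
    def wrap_samples(obj, data, *args, **kwargs):
        if is_datasetlike(data):
            return fx(obj, data, *args, **kwargs)
        # promote plain arrays to a Dataset before calling the wrapped method
        return fx(obj, Dataset(data), *args, **kwargs)
    return wrap_samples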
Example #10
def test_from_wizard():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]

    ds = Dataset(samples, sa={'targets': labels, 'chunks': chunks})
    ds.init_origids('both')
    first = ds.sa.origids
    # now do again and check that they get regenerated
    ds.init_origids('both')
    assert_false(first is ds.sa.origids)
    assert_array_equal(first, ds.sa.origids)

    ok_(is_datasetlike(ds))
    ok_(not is_datasetlike(labels))

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    ## XXX stuff that needs thought:

    # ds.sa (empty) has this in the public namespace:
    #   add, get, getvalue, has_key, is_set, items, listing, name, names
    #   owner, remove, reset, setvalue, which_set
    # maybe we need some form of lightweightCollection?

    assert_array_equal(ds.samples, samples)
    assert_array_equal(ds.sa.targets, labels)
    assert_array_equal(ds.sa.chunks, chunks)

    # same should work for shortcuts
    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    ok_(sorted(ds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(ds.fa.keys()) == ['origids'])
    # add some more
    ds.a['random'] = 'blurb'

    # check stripping attributes from a copy
    cds = ds.copy() # full copy
    ok_(sorted(cds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(cds.fa.keys()) == ['origids'])
    ok_(sorted(cds.a.keys()) == ['random'])
    cds = ds.copy(sa=[], fa=[], a=[]) # plain copy
    ok_(cds.sa.keys() == [])
    ok_(cds.fa.keys() == [])
    ok_(cds.a.keys() == [])
    cds = ds.copy(sa=['targets'], fa=None, a=['random']) # partial copy
    ok_(cds.sa.keys() == ['targets'])
    ok_(cds.fa.keys() == ['origids'])
    ok_(cds.a.keys() == ['random'])

    # there is not necessarily a mapper present
    ok_(not ds.a.has_key('mapper'))

    # has to complain about misshaped samples attributes
    assert_raises(ValueError, Dataset.from_wizard, samples, labels + labels)

    # check that we actually have attributes of the expected type
    ok_(isinstance(ds.sa['targets'], ArrayCollectable))

    # the dataset will take care of not adding stupid stuff
    assert_raises(ValueError, ds.sa.__setitem__, 'stupid', np.arange(3))
    assert_raises(ValueError, ds.fa.__setitem__, 'stupid', np.arange(4))
    # or change proper attributes to stupid shapes
    try:
        ds.sa.targets = np.arange(3)
    except ValueError:
        pass
    else:
        ok_(False, msg="Assigning value with improper shape to attribute "
                       "did not raise exception.")