Example #1
 def __init__(self,
              count=None,
              selection_strategy='equidistant',
              attr='chunks',
              space='partitions',
              **kwargs):
     """
     Parameters
     ----------
     count : None or int
       Desired number of splits to be output. It is limited by the
       number of splits possible for a given splitter
       (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
       all splits are output (default).
     selection_strategy : str
        If `count` is not None, the following strategies are possible:
       'first': First `count` splits are chosen;
       'random': Random (without replacement) `count` splits are chosen;
       'equidistant': Splits which are equidistant from each other.
     attr : str
       Sample attribute used to determine splits.
     space : str
        Name of the to-be-created sample attribute defining the partitions.
       In addition, a dataset attribute named '`space`_set' will be added
       to each output dataset, indicating the number of the partition set
       it corresponds to.
     """
     Node.__init__(self, space=space, **kwargs)
      # pylint happiness block
     self.__splitattr = attr
     # we don't check it, thus no reason to make it private.
     # someone might find it useful to change post creation
      # TODO utilize such (or similar) policy throughout the code
     self.count = count
     self._set_selection_strategy(selection_strategy)
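
The three `selection_strategy` values boil down to simple index arithmetic over the sequence of candidate splits. A minimal standalone sketch of that logic (illustrative only; `select_splits` is a hypothetical helper, not this library's API):

import numpy as np

def select_splits(splits, count, strategy='equidistant', rng=None):
    """Pick `count` items from `splits` using one of the documented strategies."""
    if count is None or count >= len(splits):
        return list(splits)          # nothing to limit
    if strategy == 'first':
        return list(splits[:count])
    if strategy == 'random':
        rng = rng or np.random
        idx = rng.choice(len(splits), size=count, replace=False)
        return [splits[i] for i in sorted(idx)]
    if strategy == 'equidistant':
        # evenly spaced indices across the full range of available splits
        idx = np.linspace(0, len(splits) - 1, count).round().astype(int)
        return [splits[i] for i in idx]
    raise ValueError("unknown selection_strategy %r" % strategy)

print(select_splits(list(range(10)), 3))  # -> [0, 4, 9]
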
Example #2
    def __init__(self, attr, count=1, limit=None, assure=False, **kwargs):
        """
        Parameters
        ----------
        attr : str or list(str)
          Name of the to-be-permuted attribute. This can also be a list of
          attribute names, in which case the *identical* shuffling is applied to
          all listed attributes.
        count : int
          Number of permutations to be yielded by .generate()
        limit : None or str or dict
          If ``None`` all attribute values will be permuted. If a single
          attribute name is given, its unique values will be used to define
          chunks of data that are permuted individually (i.e. no attribute
          values will be swapped across chunks). Finally, if a dictionary is
          provided, its keys define attribute names and its values (a single
          value or a sequence thereof) attribute values; all key-value
          combinations across all given items define a "selection" of
          to-be-permuted samples or features.
        assure : bool
          If set, by-chance non-permutations will be prevented, i.e. it is
          checked that at least two items change their position. Since this
          check adds a runtime penalty it is off by default.
        """
        Node.__init__(self, **kwargs)
        self._pattr = attr

        self.count = count
        self._limit = limit
        self._pcfg = None
        self._assure_permute = assure
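
The `limit` argument effectively partitions the samples before any shuffling happens. A rough sketch of a within-limit permutation, assuming a single limit attribute (`permute_within_limit` is a hypothetical helper, not the class's actual internals):

import numpy as np

def permute_within_limit(values, limit=None, rng=None):
    """Shuffle `values`, but only within groups defined by `limit`."""
    rng = rng or np.random.RandomState()
    values = np.asarray(values)
    out = values.copy()
    if limit is None:
        rng.shuffle(out)             # permute everything at once
        return out
    limit = np.asarray(limit)
    for chunk in np.unique(limit):   # permute each chunk independently
        mask = limit == chunk
        block = values[mask].copy()
        rng.shuffle(block)
        out[mask] = block
    return out

targets = ['a', 'b', 'a', 'b']
chunks = [0, 0, 1, 1]
print(permute_within_limit(targets, limit=chunks))  # values stay within their chunk
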
Example #3
 def __init__(self,
              amount='equal',
              attr='targets',
              count=1,
              limit='chunks',
              apply_selection=False,
              include_offlimit=False,
              space='balanced_set',
              rng=None,
              **kwargs):
     """
     Parameters
     ----------
     amount : {'equal'} or int or float
       Specify the amount of elements to be selected (within the current
       ``limit``). The amount can be given as an integer value corresponding
       to the absolute number of elements per unique attribute (see ``attr``)
       value, as a float corresponding to the fraction of elements, or with
        the keyword 'equal'. In the latter case the number of to-be-selected
       elements is determined by the least number of available elements for
       any given unique attribute value within the current limit.
     attr : str
       Dataset attribute whose unique values define element classes that are
       to be balanced in number.
     count : int
       How many iterations to perform on ``generate()``.
     limit : None or str or dict
       If ``None`` the whole dataset is considered as one. If a single
       attribute name is given, its unique values will be used to define
       chunks of data that are balanced individually. Finally, if a
       dictionary is provided, its keys define attribute names and its values
       (single value or sequence thereof) attribute value, where all
       key-value combinations across all given items define a "selection" of
       to-be-balanced samples or features.
     apply_selection : bool
       Flag whether the balanced selection shall be applied, i.e. the output
       dataset only contains selected elements. If False, the selection is
       instead added as an attribute that merely marks selected elements (see
       ``space`` argument).
     include_offlimit : bool
        If True, all samples that were off limit (i.e. not included in the
        balancing input) are included in the balanced selection. If False
        (default), they are excluded.
     space : str
       Name of the selection marker attribute in the output dataset that is
       created if the balanced selection is not applied to the output dataset
       (see ``apply_selection`` argument).
     rng : int or RandomState, optional
        Integer to seed a new RandomState upon each call, or an instance of
        numpy.random.RandomState to be reused across calls. If None, the
        numpy.random singleton is used.
     """
     Node.__init__(self, space=space, **kwargs)
     self._amount = amount
     self._attr = attr
     self.count = count
     self._limit = limit
     self._include_offlimit = include_offlimit
     self._apply_selection = apply_selection
     self._rng = rng
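
For `amount='equal'`, the least populated value of `attr` dictates how many elements survive per class; integers and floats request absolute and fractional per-class counts respectively. A sketch of that selection rule, assuming a single flat limit (`balanced_mask` is a hypothetical helper):

import numpy as np

def balanced_mask(attr_values, amount='equal', rng=None):
    """Boolean mask marking a selection balanced across unique attribute values."""
    rng = rng or np.random.RandomState()
    attr_values = np.asarray(attr_values)
    uniques, counts = np.unique(attr_values, return_counts=True)
    mask = np.zeros(len(attr_values), dtype=bool)
    for u, c in zip(uniques, counts):
        if amount == 'equal':
            k = counts.min()              # least available count across classes
        elif isinstance(amount, float):
            k = int(round(c * amount))    # fraction of this class's elements
        else:
            k = int(amount)               # absolute count per class
        idx = np.flatnonzero(attr_values == u)
        mask[rng.choice(idx, size=min(k, c), replace=False)] = True
    return mask

targets = ['a'] * 5 + ['b'] * 3
print(balanced_mask(targets).sum())  # 6: the min count (3) for each of two classes
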
Example #4
 def __init__(self,
              count=None,
              selection_strategy='equidistant',
              attr='chunks',
              space='partitions',
              **kwargs):
     """
     Parameters
     ----------
     count : None or int
       Desired number of splits to be output. It is limited by the
       number of splits possible for a given splitter
       (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
       all splits are output (default).
     selection_strategy : str
        If `count` is not None, the following strategies are possible:
       'first': First `count` splits are chosen;
       'random': Random (without replacement) `count` splits are chosen;
       'equidistant': Splits which are equidistant from each other.
     attr : str
       Sample attribute used to determine splits.
     space : str
        Name of the to-be-created sample attribute defining the partitions.
        In addition, a dataset attribute named '``space``_set' will be added
       to each output dataset, indicating the number of the partition set
       it corresponds to.
     """
     Node.__init__(self, space=space, **kwargs)
      # pylint happiness block
     self.__attr = attr
     # we don't check it, thus no reason to make it private.
     # someone might find it useful to change post creation
      # TODO utilize such (or similar) policy throughout the code
     self.count = count
     self._set_selection_strategy(selection_strategy)
Example #5
 def __init__(self,
              amount='equal',
              attr='targets',
              count=1,
              limit='chunks',
              apply_selection=False,
              include_offlimit=False,
              space='balanced_set',
              rng=None,
              **kwargs):
     """
     Parameters
     ----------
     amount : {'equal'} or int or float
       Specify the amount of elements to be selected (within the current
       ``limit``). The amount can be given as an integer value corresponding
       to the absolute number of elements per unique attribute (see ``attr``)
       value, as a float corresponding to the fraction of elements, or with
        the keyword 'equal'. In the latter case the number of to-be-selected
       elements is determined by the least number of available elements for
       any given unique attribute value within the current limit.
     attr : str
       Dataset attribute whose unique values define element classes that are
       to be balanced in number.
     count : int
       How many iterations to perform on ``generate()``.
     limit : None or str or dict
       If ``None`` the whole dataset is considered as one. If a single
       attribute name is given, its unique values will be used to define
       chunks of data that are balanced individually. Finally, if a
       dictionary is provided, its keys define attribute names and its values
       (single value or sequence thereof) attribute value, where all
       key-value combinations across all given items define a "selection" of
       to-be-balanced samples or features.
     apply_selection : bool
       Flag whether the balanced selection shall be applied, i.e. the output
       dataset only contains selected elements. If False, the selection is
       instead added as an attribute that merely marks selected elements (see
       ``space`` argument).
     include_offlimit : bool
        If True, all samples that were off limit (i.e. not included in the
        balancing input) are included in the balanced selection. If False
        (default), they are excluded.
     space : str
       Name of the selection marker attribute in the output dataset that is
       created if the balanced selection is not applied to the output dataset
       (see ``apply_selection`` argument).
     rng : int or RandomState, optional
        Integer to seed a new RandomState upon each call, or an instance of
        numpy.random.RandomState to be reused across calls. If None, the
        numpy.random singleton is used.
     """
     Node.__init__(self, space=space, **kwargs)
     self._amount = amount
     self._attr = attr
     self.count = count
     self._limit = limit
     self._include_offlimit = include_offlimit
     self._apply_selection = apply_selection
     self._rng = rng
Example #6
 def __init__(self, attr, count=1, limit=None, assure=False, **kwargs):
     """
     Parameters
     ----------
     attr : str or list(str)
       Name of the to-be-permuted attribute. This can also be a list of
       attribute names, in which case the *identical* shuffling is applied to
       all listed attributes.
     count : int
        Number of permutations to be yielded by .generate()
     limit : None or str or dict
        If ``None`` all attribute values will be permuted. If a single
        attribute name is given, its unique values will be used to define
        chunks of data that are permuted individually (i.e. no attribute
        values will be swapped across chunks). Finally, if a dictionary is
        provided, its keys define attribute names and its values (a single
        value or a sequence thereof) attribute values; all key-value
        combinations across all given items define a "selection" of
        to-be-permuted samples or features.
     assure : bool
       If set, by-chance non-permutations will be prevented, i.e. it is
       checked that at least two items change their position. Since this
       check adds a runtime penalty it is off by default.
     """
     Node.__init__(self, **kwargs)
     self._pattr = attr
     self.nruns = count
     self._limit = limit
     self._pcfg = None
     self._assure_permute = assure
Example #7
    def __init__(self,
                 attr,
                 count=1,
                 limit=None,
                 assure=False,
                 strategy='simple',
                 chunk_attr=None,
                 rng=None,
                 **kwargs):
        """
        Parameters
        ----------
        attr : str or list(str)
          Name of the to-be-permuted attribute. This can also be a list of
          attribute names, in which case the *identical* shuffling is applied to
          all listed attributes.
        count : int
          Number of permutations to be yielded by .generate()
        limit : None or str or list or dict
          If ``None`` all attribute values will be permuted. If a single
          attribute name is given, its unique values will be used to define
          chunks of data that are permuted individually (i.e. no attribute
          values will be swapped across chunks). If a list is given, the
          combination of those attributes per sample is used. Finally, if a
          dictionary is provided, its keys define attribute names and its
          values (a single value or a sequence thereof) attribute values;
          all key-value combinations across all given items define a
          "selection" of to-be-permuted samples or features.
        strategy : 'simple', 'uattrs', 'chunks'
          'simple' strategy is the straightforward permutation of attributes (given
          the limit).  In some sense it assumes independence of those samples.
          'uattrs' strategy looks at unique values of `attr` (or their unique
          combinations in case of `attr` being a list) and permutes those
          unique combination values, thus breaking their assignment to the
          samples but preserving any dependencies between samples within the
          same unique combination. The 'chunks' strategy swaps attribute values
          Naturally, this will only work if there is the same number of samples in
          all chunks.
        assure : bool
          If set, by-chance non-permutations will be prevented, i.e. it is
          checked that at least two items change their position. Since this
          check adds a runtime penalty it is off by default.
        rng : int or RandomState, optional
          Integer to seed a new RandomState upon each call, or an instance of
          numpy.random.RandomState to be reused across calls. If None, the
          numpy.random singleton is used.
        """
        Node.__init__(self, **kwargs)
        self._pattr = attr

        self.count = count
        self._limit = limit

        self._assure_permute = assure
        self.strategy = strategy
        self.rng = rng
        self.chunk_attr = chunk_attr
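
The difference between 'simple' and 'uattrs' is what gets shuffled: individual sample attributes versus the assignment of whole unique values. A sketch of the 'uattrs' idea, assuming a single attribute (`permute_uattrs` is a hypothetical helper):

import numpy as np

def permute_uattrs(values, rng=None):
    """Relabel unique values by a random permutation, keeping co-assignment intact."""
    rng = rng or np.random.RandomState()
    values = np.asarray(values)
    uniques = np.unique(values)
    relabeled = uniques.copy()
    rng.shuffle(relabeled)
    mapping = dict(zip(uniques, relabeled))
    # all samples sharing a value still share one (possibly different) value
    return np.array([mapping[v] for v in values])

print(permute_uattrs(['a', 'a', 'b', 'b', 'c']))
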
Example #8
 def __init__(self, count, space='repetitions', **kwargs):
     """
     Parameters
     ----------
     count : int
        Positive integer that sets the number of repetitions.
     space : str
        The name of the dataset attribute that will hold the actual repetition
        index in the yielded datasets.
     """
     Node.__init__(self, space=space, **kwargs)
     self.count = count
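
The repeater is essentially a counted generator that stamps each yielded dataset with its repetition index. A toy sketch using a plain dict in place of a dataset (the real class stores the index in a dataset attribute named by `space`):

def repeat(ds, count, space='repetitions'):
    """Yield `count` shallow copies of `ds`, each tagged with its repetition index."""
    for i in range(count):
        out = dict(ds)
        out[space] = i       # stand-in for the dataset attribute named by `space`
        yield out

for ds in repeat({'samples': [1, 2, 3]}, count=2):
    print(ds['repetitions'], ds['samples'])
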
Example #9
 def __init__(self, k,
              targets_attr,
              partitions_attr='partitions',
              partitions_keep=2,  # default for testing partition
              partition_assign=3, # assign one which Splitter doesn't even get to
              **kwargs):
     Node.__init__(self, **kwargs)
     self.k = k
     self.targets_attr = targets_attr
     self.partitions_attr = partitions_attr
     self.partitions_keep = partitions_keep
     self.partition_assign = partition_assign
Example #10
 def __init__(self, fx, space, **kwargs):
     """
     Parameters
     ----------
     fx : callable
        Callable that is passed the dataset samples as the first and the
        attribute values as the second argument.
     space : str
        Name of the sample attribute that contains the target values.
     """
     Node.__init__(self, space=space, **kwargs)
     self.fx = fx
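
The calling convention is the only contract here: samples first, attribute values second. A sketch of how such an `fx` might be invoked and what it could compute (illustrative; `apply_fx` is a hypothetical stand-in):

import numpy as np

def apply_fx(samples, targets, fx):
    """Call `fx` with the samples as first and the attribute values as second argument."""
    return fx(np.asarray(samples), np.asarray(targets))

# example fx: per-class means of the samples
fx = lambda samples, targets: {t: samples[targets == t].mean()
                               for t in np.unique(targets)}
print(apply_fx([1., 2., 3., 4.], ['a', 'a', 'b', 'b'], fx))  # {'a': 1.5, 'b': 3.5}
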
Example #11
 def __init__(self, fx, space, **kwargs):
     """
     Parameters
     ----------
     fx : callable
        Callable that is passed the dataset samples as the first and the
        attribute values as the second argument.
     space : str
        Name of the sample attribute that contains the target values.
     """
     Node.__init__(self, space=space, **kwargs)
     self.fx = fx
Example #12
 def __init__(self, k,
              targets_attr,
              partitions_attr='partitions',
              partitions_keep=2,  # default for testing partition
              partition_assign=3, # assign one which Splitter doesn't even get to
              **kwargs):
     Node.__init__(self, **kwargs)
     self.k = k
     self.targets_attr = targets_attr
     self.partitions_attr = partitions_attr
     self.partitions_keep = partitions_keep
     self.partition_assign = partition_assign
Example #13
 def __init__(self, count, space='repetitions', **kwargs):
     """
     Parameters
     ----------
     count : int
        Positive integer that sets the number of repetitions.
     space : str
        The name of the dataset attribute that will hold the actual repetition
        index in the yielded datasets.
     """
     Node.__init__(self, space=space, **kwargs)
     self.count = count
Example #14
    def __init__(self, attr, count=1, limit=None, assure=False,
                 strategy='simple', chunk_attr=None, rng=None, **kwargs):
        """
        Parameters
        ----------
        attr : str or list(str)
          Name of the to-be-permuted attribute. This can also be a list of
          attribute names, in which case the *identical* shuffling is applied to
          all listed attributes.
        count : int
          Number of permutations to be yielded by .generate()
        limit : None or str or list or dict
          If ``None`` all attribute values will be permuted. If a single
          attribute name is given, its unique values will be used to define
          chunks of data that are permuted individually (i.e. no attribute
          values will be swapped across chunks). If a list is given, the
          combination of those attributes per sample is used. Finally, if a
          dictionary is provided, its keys define attribute names and its
          values (a single value or a sequence thereof) attribute values;
          all key-value combinations across all given items define a
          "selection" of to-be-permuted samples or features.
        strategy : 'simple', 'uattrs', 'chunks'
          'simple' strategy is the straightforward permutation of attributes (given
          the limit).  In some sense it assumes independence of those samples.
          'uattrs' strategy looks at unique values of `attr` (or their unique
          combinations in case of `attr` being a list) and permutes those
          unique combination values, thus breaking their assignment to the
          samples but preserving any dependencies between samples within the
          same unique combination. The 'chunks' strategy swaps attribute values
          Naturally, this will only work if there is the same number of samples in
          all chunks.
        assure : bool
          If set, by-chance non-permutations will be prevented, i.e. it is
          checked that at least two items change their position. Since this
          check adds a runtime penalty it is off by default.
        rng : int or RandomState, optional
          Integer to seed a new RandomState upon each call, or an instance of
          numpy.random.RandomState to be reused across calls. If None, the
          numpy.random singleton is used.
        """
        Node.__init__(self, **kwargs)
        self._pattr = attr

        self.count = count
        self._limit = limit

        self._assure_permute = assure
        self.strategy = strategy
        self.rng = rng
        self.chunk_attr = chunk_attr
Example #15
 def __init__(self, includes, *args, **kwargs):
     """
     Parameters
     ----------
     includes : list
        List of tuple rules (attribute, unique_values), where all
        listed 'unique_values' must be present in the dataset.
        Matching samples or features get selected to proceed to the
        next rule in the list.  If at some point not all listed
        values of the attribute are present, the dataset does not
        pass through the 'Sifter'.
     """
     Node.__init__(self, *args, **kwargs)
     self._includes = includes
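
The rule list reads as a chain of presence checks. A sketch of the pass/fail logic only (the actual 'Sifter' also narrows the candidate samples between rules; `sifter_passes` is a hypothetical simplification):

def sifter_passes(attrs, includes):
    """True if, for every (attribute, unique_values) rule, all values are present."""
    for attribute, unique_values in includes:
        present = set(attrs.get(attribute, ()))
        if not set(unique_values) <= present:
            return False     # a required value is missing -> dataset is sifted out
    return True

attrs = {'targets': ['a', 'b', 'a'], 'chunks': [0, 0, 1]}
print(sifter_passes(attrs, [('targets', ['a', 'b'])]))  # True
print(sifter_passes(attrs, [('targets', ['a', 'c'])]))  # False: no 'c' present
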
Example #16
 def __init__(self, includes, *args, **kwargs):
     """
     Parameters
     ----------
     includes : list
        List of tuple rules (attribute, uvalues), where all
        listed 'uvalues' must be present in the dataset.
        Matching samples or features get selected to proceed to the
        next rule in the list.  If at some point not all listed
        values of the attribute are present, the dataset does not
        pass through the 'Sifter'.
        uvalues might also be a `dict`, see example above.
     """
     Node.__init__(self, *args, **kwargs)
     self._includes = includes
Example #17
 def __init__(self, space, prestrip, poststrip, **kwargs):
     """
     Parameters
     ----------
     space : str
        Name of the sample attribute that shall be used to determine the
       boundaries.
     prestrip : int
       Number of samples to be stripped prior to each boundary.
     poststrip : int
       Number of samples to be stripped after each boundary (this includes
       the boundary sample itself, i.e. the first samples with a different
       sample attribute value).
     """
     Node.__init__(self, space=space, **kwargs)
     self._prestrip = prestrip
     self._poststrip = poststrip
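
Boundaries are positions where the attribute value changes; `prestrip` samples before and `poststrip` samples from the boundary onward are dropped. A sketch of the mask computation (`boundary_strip_mask` is a hypothetical helper):

import numpy as np

def boundary_strip_mask(attr, prestrip, poststrip):
    """Boolean mask keeping samples away from attribute-value boundaries."""
    attr = np.asarray(attr)
    keep = np.ones(len(attr), dtype=bool)
    # a boundary is the index of the first sample with a new attribute value
    boundaries = np.flatnonzero(attr[1:] != attr[:-1]) + 1
    for b in boundaries:
        keep[max(0, b - prestrip):b] = False   # samples before the boundary
        keep[b:b + poststrip] = False          # boundary sample and its successors
    return keep

print(boundary_strip_mask([0, 0, 0, 1, 1, 1], prestrip=1, poststrip=1))
# -> [ True  True False False  True  True]
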
Example #18
 def __init__(self, auto_train=False, force_train=False, **kwargs):
     """
     Parameters
     ----------
     auto_train : bool
       Flag whether the learner will automatically train itself on the input
       dataset when called untrained.
     force_train : bool
       Flag whether the learner will enforce training on the input dataset
       upon every call.
     **kwargs
       All arguments are passed to the baseclass.
     """
     Node.__init__(self, **kwargs)
     self.__is_trained = False
     self.__auto_train = auto_train
     self.__force_train = force_train
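
The two flags gate when training happens relative to a call. A toy stand-in showing one plausible gating logic (not the actual base class):

class MiniLearner:
    """Toy model of auto_train/force_train semantics."""
    def __init__(self, auto_train=False, force_train=False):
        self._trained = False
        self._auto_train = auto_train
        self._force_train = force_train

    def train(self, ds):
        self._trained = True         # a real learner would fit a model here

    def __call__(self, ds):
        if self._force_train or (self._auto_train and not self._trained):
            self.train(ds)           # train on demand
        if not self._trained:
            raise RuntimeError("called untrained and auto_train is disabled")
        return ds                    # a real learner would produce predictions

learner = MiniLearner(auto_train=True)
learner([1, 2, 3])                   # trains itself on first call instead of raising
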
Example #19
 def __init__(self, auto_train=False, force_train=False, **kwargs):
     """
     Parameters
     ----------
     auto_train : bool
       Flag whether the learner will automatically train itself on the input
       dataset when called untrained.
     force_train : bool
       Flag whether the learner will enforce training on the input dataset
       upon every call.
     **kwargs
       All arguments are passed to the baseclass.
     """
     Node.__init__(self, **kwargs)
     self.__is_trained = False
     self.__auto_train = auto_train
     self.__force_train = force_train
Example #20
 def __init__(self, space, prestrip, poststrip, **kwargs):
     """
     Parameters
     ----------
     space : str
        Name of the sample attribute that shall be used to determine the
       boundaries.
     prestrip : int
       Number of samples to be stripped prior to each boundary.
     poststrip : int
       Number of samples to be stripped after each boundary (this includes
       the boundary sample itself, i.e. the first samples with a different
       sample attribute value).
     """
     Node.__init__(self, space=space, **kwargs)
     self._prestrip = prestrip
     self._poststrip = poststrip
Example #21
 def __init__(self,
              attr,
              attr_values=None,
              count=None,
              noslicing=False,
              reverse=False,
              ignore_values=None,
              **kwargs):
     """
     Parameters
     ----------
     attr : str
       Typically the sample or feature attribute used to determine splits.
     attr_values : tuple
        If not None, this is a list of values of ``attr`` used to determine
       the splits. The order of values in this list defines the order of the
       resulting splits. It is possible to specify a particular value
       multiple times. All dataset samples with values that are not listed
       are going to be ignored.
     count : None or int
       Desired number of generated splits. If None, all splits are output
       (default), otherwise the number of splits is limited to the given
        ``count`` or the maximum number of possible splits (whichever is less).
     noslicing : bool
       If True, dataset splitting is not done by slicing (causing
       shared data between source and split datasets) even if it would
       be possible. By default slicing is performed whenever possible
       to reduce the memory footprint.
     reverse : bool
        If True, the order of datasets in the split is reversed, e.g.
        instead of (training, testing), (testing, training) will be
        output.
     ignore_values : tuple
        If not None, this is a list of values of ``attr`` that shall be
        ignored when determining the splits. This setting also affects
        any specified ``attr_values``.
     """
     Node.__init__(self, space=attr, **kwargs)
     self.__splitattr_values = attr_values
     self.__splitattr_ignore = ignore_values
     self.__count = count
     self.__noslicing = noslicing
     self.__reverse = reverse
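
Splitting amounts to iterating over the (optionally reordered and filtered) values of `attr` and selecting the matching samples for each. A sketch of that iteration (`split_by_attr` is a hypothetical helper; the slicing optimization controlled by `noslicing` is omitted):

import numpy as np

def split_by_attr(samples, attr, attr_values=None, count=None,
                  reverse=False, ignore_values=None):
    """Yield one subset of `samples` per split-defining value of `attr`."""
    samples, attr = np.asarray(samples), np.asarray(attr)
    ignore = set(ignore_values or ())
    if attr_values is None:
        values = [v for v in np.unique(attr) if v not in ignore]
    else:
        values = [v for v in attr_values if v not in ignore]
    if reverse:
        values = values[::-1]        # e.g. (testing, training) instead
    if count is not None:
        values = values[:count]      # cap the number of generated splits
    for v in values:
        yield samples[attr == v]

for split in split_by_attr(['s0', 's1', 's2', 's3'], [1, 1, 2, 2]):
    print(split)
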
Example #22
 def __init__(
     self, attr, attr_values=None, count=None, noslicing=False, reverse=False, ignore_values=None, **kwargs
 ):
     """
     Parameters
     ----------
     attr : str
       Typically the sample or feature attribute used to determine splits.
     attr_values : tuple
        If not None, this is a list of values of ``attr`` used to determine
       the splits. The order of values in this list defines the order of the
       resulting splits. It is possible to specify a particular value
       multiple times. All dataset samples with values that are not listed
       are going to be ignored.
     count : None or int
       Desired number of generated splits. If None, all splits are output
       (default), otherwise the number of splits is limited to the given
        ``count`` or the maximum number of possible splits (whichever is less).
     noslicing : bool
       If True, dataset splitting is not done by slicing (causing
       shared data between source and split datasets) even if it would
       be possible. By default slicing is performed whenever possible
       to reduce the memory footprint.
     reverse : bool
        If True, the order of datasets in the split is reversed, e.g.
        instead of (training, testing), (testing, training) will be
        output.
     ignore_values : tuple
        If not None, this is a list of values of ``attr`` that shall be
        ignored when determining the splits. This setting also affects
        any specified ``attr_values``.
     """
     Node.__init__(self, space=attr, **kwargs)
     self.__splitattr_values = attr_values
     self.__splitattr_ignore = ignore_values
     self.__count = count
     self.__noslicing = noslicing
     self.__reverse = reverse
Example #23
 def __init__(self,
              amount='equal',
              attr='targets',
              count=1,
              limit='chunks',
              apply_selection=False,
              include_offlimit=False,
              space='balanced_set',
              **kwargs):
     """
     Parameters
     ----------
     amount : {'equal'} or int or float
        Specify the amount of elements to be selected (within the current
        ``limit``).
     """
     Node.__init__(self, space=space, **kwargs)
     self._amount = amount
     self._attr = attr
     self.count = count
     self._limit = limit
     self._limit_filter = None
     self._include_offlimit = include_offlimit
     self._apply_selection = apply_selection