def __init__(self, count=None, selection_strategy='equidistant',
             attr='chunks', space='partitions', **kwargs):
    """
    Parameters
    ----------
    count : None or int
      Desired number of splits to be output. It is limited by the
      number of splits possible for a given splitter
      (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
      all splits are output (default).
    selection_strategy : str
      If `count` is not None, the following strategies are possible:
      'first': First `count` splits are chosen;
      'random': Random (without replacement) `count` splits are chosen;
      'equidistant': Splits which are equidistant from each other.
    attr : str
      Sample attribute used to determine splits.
    space : str
      Name of the to-be-created sample attribute defining the
      partitions. In addition, a dataset attribute named '`space`_set'
      will be added to each output dataset, indicating the number of
      the partition set it corresponds to.
    """
    Node.__init__(self, space=space, **kwargs)
    # pylint happiness block
    self.__splitattr = attr

    # we don't check it, thus no reason to make it private.
    # someone might find it useful to change post creation
    # TODO: utilize such (or similar) policy throughout the code
    self.count = count
    self._set_selection_strategy(selection_strategy)
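# A minimal usage sketch, assuming this is the base of a PyMVPA-style
# partitioner. The class name `SomePartitioner` and the dataset `ds` (with a
# 'chunks' sample attribute) are assumptions for illustration only:
#
#   partitioner = SomePartitioner(count=5, selection_strategy='random')
#   for pds in partitioner.generate(ds):
#       # each yielded dataset gains a 'partitions' sample attribute and a
#       # 'partitions_set' dataset attribute, as documented above
#       train = pds[pds.sa.partitions == 1]
#       test = pds[pds.sa.partitions == 2]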
def __init__(self, attr, count=1, limit=None, assure=False, **kwargs):
    """
    Parameters
    ----------
    attr : str or list(str)
      Name of the to-be-permuted attribute. This can also be a list of
      attribute names, in which case the *identical* shuffling is
      applied to all listed attributes.
    count : int
      Number of permutations to be yielded by .generate()
    limit : None or str or dict
      If ``None`` all attribute values will be permuted. If a single
      attribute name is given, its unique values will be used to define
      chunks of data that are permuted individually (i.e. no attribute
      values will be replaced across chunks). Finally, if a dictionary
      is provided, its keys define attribute names and its values
      (single value or sequence thereof) attribute values, where all
      key-value combinations across all given items define a
      "selection" of to-be-permuted samples or features.
    assure : bool
      If set, by-chance non-permutations will be prevented, i.e. it is
      checked that at least two items change their position. Since this
      check adds a runtime penalty it is off by default.
    """
    Node.__init__(self, **kwargs)
    self._pattr = attr
    self.count = count
    self._limit = limit
    self._pcfg = None
    self._assure_permute = assure
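# Hypothetical sketch of the classic use: building a permutation-based null
# distribution. The class name `AttributePermutator`, the dataset `ds`, and
# the helper `run_cv` are assumptions for illustration:
#
#   permutator = AttributePermutator('targets', count=100, limit='chunks')
#   null_results = [run_cv(pds) for pds in permutator.generate(ds)]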
def __init__(self, amount='equal', attr='targets', count=1, limit='chunks',
             apply_selection=False, include_offlimit=False,
             space='balanced_set', rng=None, **kwargs):
    """
    Parameters
    ----------
    amount : {'equal'} or int or float
      Specify the amount of elements to be selected (within the current
      ``limit``). The amount can be given as an integer value
      corresponding to the absolute number of elements per unique
      attribute (see ``attr``) value, as a float corresponding to the
      fraction of elements, or with the keyword 'equal'. In the latter
      case the number of to-be-selected elements is determined by the
      least number of available elements for any given unique attribute
      value within the current limit.
    attr : str
      Dataset attribute whose unique values define element classes that
      are to be balanced in number.
    count : int
      How many iterations to perform on ``generate()``.
    limit : None or str or dict
      If ``None`` the whole dataset is considered as one. If a single
      attribute name is given, its unique values will be used to define
      chunks of data that are balanced individually. Finally, if a
      dictionary is provided, its keys define attribute names and its
      values (single value or sequence thereof) attribute values, where
      all key-value combinations across all given items define a
      "selection" of to-be-balanced samples or features.
    apply_selection : bool
      Flag whether the balanced selection shall be applied, i.e. the
      output dataset only contains selected elements. If False, the
      selection is instead added as an attribute that merely marks
      selected elements (see ``space`` argument).
    include_offlimit : bool
      If True, all samples that were off limit (i.e. not included in
      the balancing input) are included in the balanced selection. If
      False (default) they are excluded.
    space : str
      Name of the selection marker attribute in the output dataset that
      is created if the balanced selection is not applied to the output
      dataset (see ``apply_selection`` argument).
    rng : int or RandomState, optional
      Integer to seed a new RandomState upon each call, or instance of
      the numpy.random.RandomState to be reused across calls. If None,
      the numpy.random singleton will be used.
    """
    Node.__init__(self, space=space, **kwargs)
    self._amount = amount
    self._attr = attr
    self.count = count
    self._limit = limit
    self._include_offlimit = include_offlimit
    self._apply_selection = apply_selection
    self._rng = rng
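# A hedged usage sketch (the class name `Balancer` and the dataset `ds` with
# 'targets' and 'chunks' sample attributes are assumptions):
#
#   balancer = Balancer(amount='equal', attr='targets', limit='chunks',
#                       apply_selection=True, rng=0)
#   balanced = next(balancer.generate(ds))
#   # `balanced` now holds the same number of samples per target per chunk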
def __init__(self, count=None, selection_strategy='equidistant',
             attr='chunks', space='partitions', **kwargs):
    """
    Parameters
    ----------
    count : None or int
      Desired number of splits to be output. It is limited by the
      number of splits possible for a given splitter
      (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
      all splits are output (default).
    selection_strategy : str
      If `count` is not None, the following strategies are possible:
      'first': First `count` splits are chosen;
      'random': Random (without replacement) `count` splits are chosen;
      'equidistant': Splits which are equidistant from each other.
    attr : str
      Sample attribute used to determine splits.
    space : str
      Name of the to-be-created sample attribute defining the
      partitions. In addition, a dataset attribute named
      '``space``\_set' will be added to each output dataset, indicating
      the number of the partition set it corresponds to.
    """
    Node.__init__(self, space=space, **kwargs)
    # pylint happiness block
    self.__attr = attr

    # we don't check it, thus no reason to make it private.
    # someone might find it useful to change post creation
    # TODO: utilize such (or similar) policy throughout the code
    self.count = count
    self._set_selection_strategy(selection_strategy)
def __init__(self, attr, count=1, limit=None, assure=False, **kwargs):
    """
    Parameters
    ----------
    attr : str or list(str)
      Name of the to-be-permuted attribute. This can also be a list of
      attribute names, in which case the *identical* shuffling is
      applied to all listed attributes.
    count : int
      Number of permutations to be yielded by .generate()
    limit : None or str or dict
      If ``None`` all attribute values will be permuted. If a single
      attribute name is given, its unique values will be used to define
      chunks of data that are permuted individually (i.e. no attribute
      values will be replaced across chunks). Finally, if a dictionary
      is provided, its keys define attribute names and its values
      (single value or sequence thereof) attribute values, where all
      key-value combinations across all given items define a
      "selection" of to-be-permuted samples or features.
    assure : bool
      If set, by-chance non-permutations will be prevented, i.e. it is
      checked that at least two items change their position. Since this
      check adds a runtime penalty it is off by default.
    """
    Node.__init__(self, **kwargs)
    self._pattr = attr
    self.nruns = count
    self._limit = limit
    self._pcfg = None
    self._assure_permute = assure
def __init__(self, attr, count=1, limit=None, assure=False,
             strategy='simple', chunk_attr=None, rng=None, **kwargs):
    """
    Parameters
    ----------
    attr : str or list(str)
      Name of the to-be-permuted attribute. This can also be a list of
      attribute names, in which case the *identical* shuffling is
      applied to all listed attributes.
    count : int
      Number of permutations to be yielded by .generate()
    limit : None or str or list or dict
      If ``None`` all attribute values will be permuted. If a single
      attribute name is given, its unique values will be used to define
      chunks of data that are permuted individually (i.e. no attribute
      values will be replaced across chunks). If a list is given, the
      combination of those attributes per sample is used instead.
      Finally, if a dictionary is provided, its keys define attribute
      names and its values (single value or sequence thereof) attribute
      values, where all key-value combinations across all given items
      define a "selection" of to-be-permuted samples or features.
    strategy : 'simple', 'uattrs', 'chunks'
      The 'simple' strategy is the straightforward permutation of
      attributes (given the limit); in some sense it assumes
      independence of the samples. The 'uattrs' strategy looks at
      unique values of attr (or their unique combinations in case of
      `attr` being a list), and "permutes" those unique combinations,
      thus breaking their assignment to the samples while preserving
      any dependencies between samples within the same unique
      combination. The 'chunks' strategy swaps attribute values of
      entire chunks. Naturally, this will only work if there is the
      same number of samples in all chunks.
    chunk_attr : None or str
      Name of the sample attribute that defines the chunks swapped by
      the 'chunks' strategy.
    assure : bool
      If set, by-chance non-permutations will be prevented, i.e. it is
      checked that at least two items change their position. Since this
      check adds a runtime penalty it is off by default.
    rng : int or RandomState, optional
      Integer to seed a new RandomState upon each call, or instance of
      the numpy.random.RandomState to be reused across calls. If None,
      the numpy.random singleton will be used.
    """
    Node.__init__(self, **kwargs)
    self._pattr = attr
    self.count = count
    self._limit = limit
    self._assure_permute = assure
    self.strategy = strategy
    self.rng = rng
    self.chunk_attr = chunk_attr
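# Sketch of the 'chunks' strategy described above (class name, dataset `ds`,
# and equal-sized chunks are assumptions):
#
#   permutator = AttributePermutator('targets', count=50, strategy='chunks',
#                                    chunk_attr='chunks', rng=1)
#   for pds in permutator.generate(ds):
#       pass  # target labels are swapped between entire chunks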
def __init__(self, count, space='repetitions', **kwargs):
    """
    Parameters
    ----------
    count : int
      Positive integer that sets the number of repetitions.
    space : str
      The name of the dataset attribute that will hold the actual
      repetition in the yielded datasets.
    """
    Node.__init__(self, space=space, **kwargs)
    self.count = count
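# Minimal sketch: a repeater re-yields the same dataset ``count`` times,
# which is handy at the head of a generator chain (names are assumptions;
# per the docstring, each yielded dataset carries the repetition in the
# dataset attribute named by ``space``):
#
#   repeater = Repeater(count=10)
#   for rds in repeater.generate(ds):
#       rds.a['repetitions']  # repetition marker for this pass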
def __init__(self, k,
             targets_attr,
             partitions_attr='partitions',
             partitions_keep=2,   # default for testing partition
             partition_assign=3,  # assign one which Splitter doesn't even get to
             **kwargs):
    Node.__init__(self, **kwargs)
    self.k = k
    self.targets_attr = targets_attr
    self.partitions_attr = partitions_attr
    self.partitions_keep = partitions_keep
    self.partition_assign = partition_assign
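# This constructor carries no docstring. Judging from the parameters, for
# each combination of ``k`` values of ``targets_attr`` within the kept
# partition it reassigns the matching samples to ``partition_assign`` so a
# downstream Splitter never sees them. A hypothetical chained use (all class
# names are assumptions):
#
#   partitioner = ChainNode([NFoldPartitioner(),
#                            ExcludeTargetsCombinationsPartitioner(
#                                k=2, targets_attr='targets')],
#                           space='partitions')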
def __init__(self, fx, space, **kwargs):
    """
    Parameters
    ----------
    fx : callable
      Callable that is passed the dataset samples as first and
      attribute values as second argument.
    space : str
      Name of the sample attribute that contains the target values.
    """
    Node.__init__(self, space=space, **kwargs)
    self.fx = fx
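# A hedged sketch of such an fx-node. The class name `FxNode` and what the
# node does with the callable's return value are assumptions; only the
# (samples, attribute values) call signature is documented above:
#
#   node = FxNode(fx=lambda samples, targets: targets != 'rest',
#                 space='targets')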
def __init__(self, includes, *args, **kwargs):
    """
    Parameters
    ----------
    includes : list
      List of tuple rules (attribute, unique_values) where all listed
      'unique_values' must be present in the dataset. Matching samples
      or features get selected to proceed to the next rule in the list.
      If at some point not all listed values of the attribute are
      present, the dataset does not pass through the 'Sifter'.
    """
    Node.__init__(self, *args, **kwargs)
    self._includes = includes
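# Typical use is inside a generator chain after a partitioner: let a
# partitioned dataset pass only if its testing partition (value 2) contains
# all required targets. Attribute values and `partitioned_ds` here are
# assumptions for illustration:
#
#   sifter = Sifter([('partitions', 2),
#                    ('targets', ['cat', 'dog'])])
#   surviving = list(sifter.generate(partitioned_ds))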
def __init__(self, includes, *args, **kwargs):
    """
    Parameters
    ----------
    includes : list
      List of tuple rules (attribute, uvalues) where all listed
      'uvalues' must be present in the dataset. Matching samples or
      features get selected to proceed to the next rule in the list. If
      at some point not all listed values of the attribute are present,
      the dataset does not pass through the 'Sifter'. uvalues might
      also be a `dict`, see example above.
    """
    Node.__init__(self, *args, **kwargs)
    self._includes = includes
def __init__(self, space, prestrip, poststrip, **kwargs):
    """
    Parameters
    ----------
    space : str
      Name of the sample attribute that shall be used to determine the
      boundaries.
    prestrip : int
      Number of samples to be stripped prior to each boundary.
    poststrip : int
      Number of samples to be stripped after each boundary (this
      includes the boundary sample itself, i.e. the first sample with a
      different sample attribute value).
    """
    Node.__init__(self, space=space, **kwargs)
    self._prestrip = prestrip
    self._poststrip = poststrip
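# Sketch: strip samples around chunk transitions, e.g. to counter temporal
# signal carry-over between runs (the class name and dataset `ds` are
# assumptions):
#
#   stripper = StripBoundariesSamples('chunks', prestrip=2, poststrip=2)
#   stripped = next(stripper.generate(ds))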
def __init__(self, auto_train=False, force_train=False, **kwargs):
    """
    Parameters
    ----------
    auto_train : bool
      Flag whether the learner will automatically train itself on the
      input dataset when called untrained.
    force_train : bool
      Flag whether the learner will enforce training on the input
      dataset upon every call.
    **kwargs
      All arguments are passed to the baseclass.
    """
    Node.__init__(self, **kwargs)
    self.__is_trained = False
    self.__auto_train = auto_train
    self.__force_train = force_train
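# Behavior sketch for the two flags (the learner class and datasets are
# assumptions for illustration):
#
#   learner = SomeLearner(auto_train=True)
#   out = learner(ds)    # untrained at call time, so it trains on `ds` first
#
#   learner = SomeLearner(force_train=True)
#   out = learner(ds2)   # retrains on every call, even if already trained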
def __init__(self, attr, attr_values=None, count=None, noslicing=False,
             reverse=False, ignore_values=None, **kwargs):
    """
    Parameters
    ----------
    attr : str
      Typically the sample or feature attribute used to determine
      splits.
    attr_values : tuple
      If not None, this is a list of values of the ``attr`` used to
      determine the splits. The order of values in this list defines
      the order of the resulting splits. It is possible to specify a
      particular value multiple times. All dataset samples with values
      that are not listed are going to be ignored.
    count : None or int
      Desired number of generated splits. If None, all splits are
      output (default), otherwise the number of splits is limited to
      the given ``count`` or the maximum number of possible splits
      (whatever is less).
    noslicing : bool
      If True, dataset splitting is not done by slicing (causing shared
      data between source and split datasets) even if it would be
      possible. By default slicing is performed whenever possible to
      reduce the memory footprint.
    reverse : bool
      If True, the order of datasets in the split is reversed, e.g.
      instead of (training, testing), (testing, training) will be spit
      out.
    ignore_values : tuple
      If not None, this is a list of values of the ``attr`` that shall
      be ignored when determining the splits. This setting also affects
      any specified ``attr_values``.
    """
    Node.__init__(self, space=attr, **kwargs)
    self.__splitattr_values = attr_values
    self.__splitattr_ignore = ignore_values
    self.__count = count
    self.__noslicing = noslicing
    self.__reverse = reverse
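# Common pattern: split a partitioned dataset into its train/test parts
# (the class name `Splitter` and `partitioned_ds` are assumptions):
#
#   splitter = Splitter('partitions', attr_values=[1, 2])
#   train_ds, test_ds = splitter.generate(partitioned_ds)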
def __init__(self, amount='equal', attr='targets', count=1, limit='chunks',
             apply_selection=False, include_offlimit=False,
             space='balanced_set', **kwargs):
    """
    Parameters
    ----------
    amount : {'equal'} or int or float
      Specify the amount of elements to be selected (within the current
      ``limit``).
    """
    Node.__init__(self, space=space, **kwargs)
    self._amount = amount
    self._attr = attr
    self.count = count
    self._limit = limit
    self._limit_filter = None
    self._include_offlimit = include_offlimit
    self._apply_selection = apply_selection