def __init__(self, data: np.ndarray, labels: Optional[np.ndarray] = None, **kwargs):
    """Initialize dataset.

    Parameters:
        data: tabular data as a NumPy ndarray
        labels: labels as a NumPy ndarray; if not specified, the dataset is unlabeled

    Raises:
        InvalidParameterError for invalid arguments. In particular,
        the numbers of samples and labels must match.

    Examples:
        From numerical NumPy data:
        ```
        TabularData(numpy.ndarray(...), ...)
        ```

        From a Pandas DataFrame:
        ```
        df = pandas.DataFrame(..., columns=[...])
        TabularData(df.to_records(index=False), labels=...)
        ```

        From mixed NumPy data, with column names (note use of tuples):
        ```
        a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', 'U1'), ('D', int)])
        TabularData(a, ...)
        ```
    """

    # parameter validation
    data = params.instance(data, np.ndarray)
    labels = params.optional_(labels, lambda arg: params.instance(arg, np.ndarray))

    if labels is not None:
        # number of samples and labels must match
        if data.shape[0] != labels.shape[0]:
            raise InvalidParameterError(
                "same number of samples and labels",
                f"{data.shape[0]} samples, {labels.shape[0]} labels",
            )

        # uniqueness of "column" names, if any, is enforced by NumPy,
        # but only separately for data and labels
        if is_sequence(data.dtype.names) and is_sequence(labels.dtype.names):
            column_names = data.dtype.names + labels.dtype.names
            if len(column_names) != len(np.unique(column_names)):
                raise InvalidParameterError(
                    "unique column names for samples and labels", column_names
                )

    self._data, self._labels = data, labels

    super().__init__(**kwargs)
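# Usage sketch (illustrative, not part of the class): constructing labeled and
# unlabeled datasets. Variable names below are hypothetical.
#
#     import numpy as np
#
#     # unlabeled, homogeneous data: 3 samples with 2 features each
#     ds = TabularData(np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]))
#
#     # labeled data: one label per sample
#     ds = TabularData(
#         data=np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]),
#         labels=np.array([0.1, 0.2, 0.3]),
#     )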
def _intersection(
    lhs: "TabularData", rhs: "TabularData", duplicates: bool = False
) -> "TabularData":
    """Specialized intersection.

    For labeled data, labels are compared as well.

    The datasets must be compatible in the sense that both are of type
    TabularData or derived, and both labeled or both unlabeled.

    Parameters:
        lhs: one of the two datasets to intersect ('left hand side')
        rhs: one of the two datasets to intersect ('right hand side')
        duplicates: if False (default), the returned data do not contain
            duplicate entries; if True, duplicates are taken into account.
            Both inputs and labels have to match for duplicates.

    Returns:
        TabularData containing only samples in both datasets, either without
        duplicates (set intersection) or taking duplicates into account
        (multiset intersection)

    Raises:
        NotImplementedError if the set intersection can not be computed
    """

    # parameter validation
    lhs = params.instance(lhs, TabularData)
    rhs = params.instance(rhs, TabularData)
    duplicates = params.boolean(duplicates)

    # special case: empty set
    if lhs.num_samples == 0:
        return lhs.subset()  # copy
    if rhs.num_samples == 0:
        return rhs.subset()  # copy

    if lhs.is_labeled != rhs.is_labeled:
        raise InvalidParameterError("compatible TabularData", "mismatch in labeling")

    # intersection calculation
    _lhs, _rhs = TabularData._joint_data_labels(lhs), TabularData._joint_data_labels(rhs)

    if _lhs.dtype != _rhs.dtype:
        raise InvalidParameterError(
            "matching TabularData", f"{_lhs.dtype.descr} and {_rhs.dtype.descr}"
        )

    if duplicates is False:
        _, indices, _ = np.intersect1d(_lhs, _rhs, return_indices=True)  # drops any duplicates
        indices = np.sort(indices)  # restores original order
        return lhs.subset(indices)
    else:  # duplicates is True
        raise NotImplementedError(  # todo: implement
            "specialized multiset intersection not implemented for TabularData"
        )
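# Usage sketch (illustrative): intersecting two datasets drops duplicates and
# preserves the sample order of the left-hand side. Called directly here for
# illustration; in practice this is typically reached via the generic set
# operation, which dispatches to it. Assumes TabularData is instantiable.
#
#     import numpy as np
#
#     a = TabularData(np.array([[3.0], [1.0], [2.0], [1.0]]))
#     b = TabularData(np.array([[1.0], [2.0]]))
#     c = TabularData._intersection(a, b)  # samples [1.0], [2.0], in a's order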
def complement(lhs: "Data", rhs: "Data", duplicates: bool = False) -> "Data":
    """(Multi)set complement of two datasets.

    By default, the complement does not retain duplicates.
    For multiset behaviour, specify 'duplicates=True'.

    Parameters:
        lhs: set A in A - B ('left hand side')
        rhs: set B in A - B ('right hand side')
        duplicates: if False (default), the returned data do not contain
            duplicate entries; if True, duplicates are taken into account.
            Both inputs and labels have to match for duplicates.

    Returns:
        Data containing all samples in lhs but not in rhs, either without
        duplicates (set complement) or taking duplicates into account
        (multiset complement)

    Raises:
        NotImplementedError if no specialized implementation succeeds
    """

    # parameter validation
    lhs = params.instance(lhs, Data)
    rhs = params.instance(rhs, Data)
    duplicates = params.boolean(duplicates)

    # special case: empty set
    if lhs.num_samples == 0:
        return lhs.subset()
    if rhs.num_samples == 0:
        return lhs.subset()

    # try specialized implementations
    exception = None
    try:
        if hasattr(lhs.__class__, "_complement"):
            return lhs.__class__._complement(lhs, rhs, duplicates)
    except (NotImplementedError, InvalidParameterError) as e:
        exception = e
    try:
        if hasattr(rhs.__class__, "_complement"):
            return rhs.__class__._complement(lhs, rhs, duplicates)
    except (NotImplementedError, InvalidParameterError) as e:
        exception = e

    # no specialized method found or succeeded
    raise NotImplementedError("generalized (multi)set complement not implemented") from exception
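# Usage sketch (illustrative): the generic complement dispatches to a
# specialized _complement if either operand's class provides one.
#
#     import numpy as np
#
#     a = TabularData(np.array([[1.0], [2.0], [3.0], [2.0]]))
#     b = TabularData(np.array([[2.0]]))
#     c = complement(a, b)  # contains samples [1.0] and [3.0], in a's order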
def apply(self, data: Data) -> Data:
    """Transform data by adding noise to its labels.

    Parameters:
        data: labeled data to transform

    Returns:
        transformed data

    Raises:
        InvalidParameterError if data is not labeled
    """

    data = params.instance(data, Data)
    if not data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    # patch the labels() method of the data object (not the class);
    # the original labels method does not need to be saved separately
    # because it is a class member, not an object member
    for name in ("_orig_labels", "labels", "_noise"):
        # guard against name collisions with existing object attributes
        if name in data.__dict__:
            raise BenchmarkError(f"internal error: data object already has a {name} member")

    # create a copy of the dataset
    data = copy.deepcopy(data)

    # keep the original labels() method accessible, for this object only
    setattr(data, "_orig_labels", getattr(data, "labels"))

    # store noise model
    setattr(data, "_noise", self._noise)

    # add wrapper as new labels() method
    def labels(self, indices=None):
        """Query labels of a sequence of samples.

        This wrapper adds noise.

        Parameters:
            indices: a sequence of sample 'indices'; see 'samples()' for details

        Returns:
            a sequence of labels
        """
        labels = self._orig_labels(indices)
        return labels + self._noise.noise(labels.shape)

    setattr(data, "labels", labels.__get__(data))

    return data
def __init__(self, noise: Noise, **kwargs):
    """Initialize state.

    Parameters:
        noise: noise model to apply to labels
    """

    super().__init__(**kwargs)

    self._noise = params.instance(noise, Noise)
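# Usage sketch (illustrative): constructing the transformation and applying it
# to a labeled dataset. 'LabelNoise' stands for this class and 'NormalNoise'
# for a concrete Noise implementation; both names are assumptions, substitute
# the actual ones from this package.
#
#     transform = LabelNoise(noise=NormalNoise(stddev=0.1))
#     noisy = transform.apply(dataset)  # deep copy; 'dataset' is unchanged
#     noisy.labels()                    # original labels plus sampled noise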
def fit(self, data: Data) -> Learner:
    """Fit the model to training data.

    Parameters:
        data: labeled training data

    Returns:
        self (allows chaining)

    Raises:
        InvalidParameterError if data is not labeled
    """

    data = params.instance(data, Data)
    if not data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    return self
def _joint_data_labels(ds):
    """Single structured array containing both data and labels, for comparison.

    Structured arrays can be used to run NumPy set methods
    on arrays with more than one dimension.
    """

    ds = params.instance(ds, TabularData)

    if is_sequence(ds._data.dtype.names):  # structured array
        lhs = ds._data
    else:  # homogeneous array, possibly many dimensions
        lhs = np.reshape(ds._data, (ds.num_samples, -1))
        # empty field names are auto-assigned as 'f0', 'f1', ... by NumPy
        lhs = lhs.view([("", ds._data.dtype)] * np.prod(lhs.shape[1:]))
        lhs = np.reshape(lhs, ds.num_samples)

    if not ds.is_labeled:
        result = lhs
    else:  # is_labeled
        # alternatives for hstack() that did not work included
        # numpy.lib.recfunctions.merge_arrays

        if is_sequence(ds._labels.dtype.names):  # structured array
            rhs = ds._labels
        else:  # homogeneous array, possibly high-dimensional
            rhs = np.reshape(ds._labels, (ds.num_samples, -1))
            # numeric field names '0', '1', ... avoid collisions with the
            # auto-assigned 'f0', 'f1', ... field names on the data side
            rhs = rhs.view([(str(i), rhs.dtype) for i in range(np.prod(rhs.shape[1:]))])
            rhs = np.reshape(rhs, ds.num_samples)

        # lhs and rhs are structured arrays (views) now;
        # unfortunately, np.hstack fails for these
        dtypes = lhs.dtype.descr + rhs.dtype.descr
        result = np.empty(ds.num_samples, dtype=dtypes)
        for name in lhs.dtype.names:
            result[name] = lhs[name]
        for name in rhs.dtype.names:
            result[name] = rhs[name]

    return result
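# Behavior sketch (illustrative): how a homogeneous 2-d array becomes a 1-d
# structured array, so NumPy's 1-d set routines can compare whole rows.
#
#     import numpy as np
#
#     x = np.array([[1.0, 2.0], [3.0, 4.0]])
#     v = x.view([("", x.dtype)] * 2)  # fields auto-named 'f0', 'f1'
#     v = np.reshape(v, 2)             # shape (2,), one entry per row
#     # np.intersect1d, np.isin, np.unique now treat each row as one element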
def __init__(
    self,
    data: "pandas.DataFrame",  # noqa F821
    labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
    dtype: Optional[dict] = None,
    join: Optional[str] = None,
    filterf: Optional[Callable[[Any], bool]] = None,
    samplef: Optional[Callable[[Any], Any]] = None,
    labelf: Optional[Callable[[Any], Any]] = None,
    **kwargs,
):
    """Initialize dataset.

    Parameters control loading and preprocessing of the data. Order:
    1. joining
    2. filtering
    3. sample and label transform

    Parameters:
        data: the samples in the form of a Pandas DataFrame
        labels: the labels, either in the form of a Pandas DataFrame with the
            same number of rows as data and different column names, or in the
            form of a list of column names, which are then split out from the
            data and used as labels. If not specified, the dataset is unlabeled.
        dtype: the NumPy data types to use for samples and labels, in the form
            of a dictionary with column names as keys and dtypes as values.
            Can be used to override dtype auto-detection for some or all columns.
        join: if specified, name of "column" to join by; this changes labels
            to be sequences of single-entry labels
        filterf: a function that accepts a sample and returns whether to keep
            it (True) or exclude it (False). Default retains all samples.
        samplef: function accepting and returning a sample; applied to all
            samples as post-processing
        labelf: function accepting and returning a label; applied to all
            labels as post-processing

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of
        data and labels must match. If column names are given, they must be
        unique across data and labels, if any.
    """

    import pandas as pd  # only import if class is used

    # parameter validation
    data = params.instance(data, pd.DataFrame)
    labels = params.optional_(
        labels,
        lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
            lambda arg: params.tuple_(arg, params.string),
        ),
    )
    dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
    join = params.optional_(join, params.string)
    singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
    filterf = params.optional_(filterf, singleargf)
    samplef = params.optional_(samplef, singleargf)
    labelf = params.optional_(labelf, singleargf)

    if labels is None and labelf:
        raise InvalidParameterError(
            "matching labels and label function", "label function specified for unlabeled data"
        )

    # process data
    data = data.reset_index(drop=True)

    # if labels are given as a separate DataFrame, join them
    if isinstance(labels, pd.DataFrame):
        if len(data) != len(labels):
            raise InvalidParameterError(
                "matching data and labels",
                f"different number of rows ({len(data)} != {len(labels)})",
            )

        labels = labels.reset_index(drop=True)

        col_names = np.hstack((data.columns.values, labels.columns.values))
        if len(col_names) != len(np.unique(col_names)):
            raise InvalidParameterError(
                "unique column names", f"{data.columns.values} and {labels.columns.values}"
            )

        data = pd.concat([data, labels], axis=1)
        labels = labels.columns.values

    # 1. optional joining
    if join:
        groups = data.groupby(join, sort=False, as_index=False)
        data = groups.aggregate(lambda tdf: tdf.tolist())

    # 2. optional filtering
    if filterf:
        selection = data.apply(filterf, axis=1)
        data = data[selection]

    # split data and labels
    if labels is not None:
        # DataFrame column indexing requires a list, not a tuple
        data, labels = data.drop(columns=list(labels)), data[list(labels)]

    # 3. optional sample and label transform
    if samplef:
        data = data.apply(samplef, axis=1, result_type="reduce")
        if isinstance(data, pd.Series):
            data = pd.DataFrame(data, columns=["Samples"])
    if labelf:
        labels = labels.apply(labelf, axis=1, result_type="reduce")
        if isinstance(labels, pd.Series):
            labels = pd.DataFrame(labels, columns=["Labels"])

    # convert to NumPy structured arrays
    data = self._to_numpy(data, dtype=dtype)
    labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

    super().__init__(data=data, labels=labels, **kwargs)
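# Usage sketch (illustrative): constructing from a DataFrame, splitting out a
# label column and filtering rows. 'PandasTabularData' is a placeholder for
# the enclosing class name; column names are hypothetical.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0], "y": [0, 1, 0]})
#     ds = PandasTabularData(
#         data=df,
#         labels=("y",),                     # split column 'y' out as labels
#         filterf=lambda row: row["A"] > 1,  # keep only samples with A > 1
#     )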
def _complement(
    lhs: "TabularData", rhs: "TabularData", duplicates: bool = False
) -> "TabularData":
    """Specialized (multi)set complement.

    For labeled data, labels are compared as well.

    The datasets must be compatible in the sense that both are of type
    TabularData or derived, and both labeled or both unlabeled.

    Parameters:
        lhs: set A in A - B ('left hand side')
        rhs: set B in A - B ('right hand side')
        duplicates: if False (default), the returned data do not contain
            duplicate entries; if True, duplicates are taken into account.
            Both inputs and labels have to match for duplicates.

    Returns:
        Data containing all samples in lhs but not in rhs, either without
        duplicates (set complement) or taking duplicates into account
        (multiset complement)
    """

    # parameter validation
    lhs = params.instance(lhs, TabularData)
    rhs = params.instance(rhs, TabularData)
    duplicates = params.boolean(duplicates)

    # special case: empty set
    if lhs.num_samples == 0:
        return lhs.subset()
    if rhs.num_samples == 0:
        return lhs.subset()

    if lhs.is_labeled != rhs.is_labeled:
        raise InvalidParameterError("compatible TabularData", "mismatch in labeling")

    # complement calculation
    _lhs, _rhs = TabularData._joint_data_labels(lhs), TabularData._joint_data_labels(rhs)

    if _lhs.dtype != _rhs.dtype:
        raise InvalidParameterError(
            "matching TabularData", f"{_lhs.dtype.descr} and {_rhs.dtype.descr}"
        )

    if duplicates is False:
        # np.setdiff1d does not return indices, so it is not used here
        indices = np.arange(_lhs.size)[np.isin(_lhs, _rhs, invert=True)]  # indexes into _lhs
        _, indices2 = np.unique(_lhs[indices], return_index=True)  # indexes into indices
        indices = indices[np.sort(indices2)]  # restores original order
        return lhs.subset(indices)

        # the implementation below is correct but somewhat slower:
        # # remove duplicates from _lhs
        # _, indices = np.unique(_lhs, return_index=True)
        # indices = np.sort(indices)  # restores original order
        # _lhs = _lhs[indices]
        # # remove any element of _rhs
        # _, indices, _ = np.intersect1d(_lhs, _rhs, return_indices=True)
        # indices = np.setdiff1d(np.arange(_lhs.size), indices, assume_unique=True)
    else:  # duplicates is True
        raise NotImplementedError(  # todo: implement
            "specialized multiset complement not implemented for TabularData"
        )
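# Behavior sketch (illustrative): the specialized complement removes
# duplicates from the left-hand side and preserves its original sample order.
#
#     import numpy as np
#
#     a = TabularData(np.array([[5.0], [1.0], [5.0], [3.0]]))
#     b = TabularData(np.array([[3.0]]))
#     c = TabularData._complement(a, b)  # samples [5.0], [1.0]; one 5.0 kept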