def __init__(self, class_path: Optional[str] = None):
    """Set up the Java gateway shared by all instances of the derived class.

    The gateway is kept on the class, not on the instance. The first time a
    derived class is instantiated, the JVM is started and the gateway is
    created. Later instantiations reuse that gateway, unless they pass a
    class_path that differs from the one stored on the class; in that case
    the gateway is shut down and launched again with the new class path.

    Parameters:
        class_path: local filesystem class path containing one or more
            directories or .jar files. If not specified, an empty string
            is passed as classpath to the JVM.

    Raises:
        BenchmarkError if the class_path is invalid.
    """

    # todo: class_path = params.optional_(class_path, params.string)
    class_path = params.any_(class_path, params.string, params.none)

    owner = self.__class__

    # first instantiation of the derived class: no gateway exists yet
    if owner._gateway is None:
        self._launch_gateway(class_path=class_path)
        return

    # a gateway exists; restart it only if the class path changed,
    # otherwise the existing gateway is reused as is
    if owner._class_path != class_path:
        self._shutdown_gateway()
        self._launch_gateway(class_path=class_path)
def _indices_testf(self, indices: Sequence[Any]):
    """Validate an optional sequence of sample indices.

    The argument is passed through params.optional_. A specified value must
    satisfy one of two tests, tried in this order: a tuple of arity 0 (the
    empty set), or a tuple whose entries are integers validated with
    from_=0 and below=self.num_samples. The validated value is converted
    to a list.

    Parameters:
        indices: sequence of sample indices, or None

    Returns:
        the result of params.optional_; for specified indices this is a list
        (presumably None is passed through unchanged - confirm against
        params.optional_)
    """

    def sample_index(value):
        # self.num_samples is looked up at validation time, once per entry
        return params.integer(value, from_=0, below=self.num_samples)

    def empty_sequence(value):
        # empty set
        return params.tuple_(value, None, arity=0)

    def index_sequence(value):
        return params.tuple_(value, sample_index)

    def as_index_list(value):
        # NumPy indexing expects a list
        return list(params.any_(value, empty_sequence, index_sequence))

    return params.optional_(indices, as_index_list)
def noise(self, shape=None):
    """Generate Gaussian noise.

    Samples are drawn from self.random.normal using the mean and standard
    deviation stored on this object (self._mean, self._stddev).

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            either a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing independent
        identically distributed Gaussian noise

    Raises:
        InvalidParameterError: for invalid parameters
    """

    def positive_int(value):
        # from_=1: entries must be at least 1
        return params.integer(value, from_=1)

    def positive_int_tuple(value):
        return params.tuple_(value, positive_int)

    # a valid shape is either a positive integer or a tuple of positive integers
    size = params.any_(shape, positive_int, positive_int_tuple)

    return self.random.normal(self._mean, self._stddev, size=size)
def noise(self, shape=None):
    """Return no noise.

    Instead of random values, an array filled with the constant stored on
    this object (self._value) is returned.

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            either a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing a constant value

    Raises:
        InvalidParameterError: for invalid parameters
    """

    def positive_int(value):
        # from_=1: entries must be at least 1
        return params.integer(value, from_=1)

    def positive_int_tuple(value):
        return params.tuple_(value, positive_int)

    # a valid shape is either a positive integer or a tuple of positive integers
    validated_shape = params.any_(shape, positive_int, positive_int_tuple)

    return np.full(validated_shape, self._value)
def __init__(
    self,
    data: "pandas.DataFrame",  # noqa F821
    labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
    dtype: Optional[dict] = None,
    join: Optional[str] = None,
    filterf: Optional[Callable[[Any], bool]] = None,
    samplef: Optional[Callable[[Any], Any]] = None,
    labelf: Optional[Callable[[Any], Any]] = None,
    **kwargs,
):
    """Initialize dataset.

    Parameters control loading and preprocessing of the data. Order:
    1. joining
    2. filtering
    3. sample and label transform

    After preprocessing, samples and labels are converted with
    self._to_numpy and handed to the parent class initializer.

    Parameters:
        data: the samples in the form of a Pandas DataFrame.
        labels: the labels, either in the form of a Pandas DataFrame with same number
            of rows as data and different column names, or in the form of a list of
            column names, which are then split out from the data and used as labels.
            If not specified, the dataset is unlabeled.
        dtype: the NumPy data types to use for samples and labels, in the form of
            a dictionary with column names as keys and dtypes as values. Can be used
            to override dtype auto-detection for some or all columns.
        join: if specified, name of "column" to join by; this changes labels
            to be sequences of single-entry labels
        filterf: a function that accepts a sample and returns whether to keep it
            (True) or exclude it (False). Default retains all samples
        samplef: function accepting and returning a sample; applied to all samples
            as post-processing
        labelf: function accepting and returning a label; applied to all labels
            as post-processing
        **kwargs: passed on unchanged to the parent class initializer

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of data
        and labels must match. If column names are given, they must be unique across
        data and labels, if any. A label function without labels is rejected.
    """

    import pandas as pd  # only import if class is used

    # parameter validation
    data = params.instance(data, pd.DataFrame)
    labels = params.optional_(
        labels,
        lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
            lambda arg: params.tuple_(arg, params.string),
        ),
    )
    dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
    join = params.optional_(join, params.string)
    singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
    filterf = params.optional_(filterf, singleargf)
    samplef = params.optional_(samplef, singleargf)
    labelf = params.optional_(labelf, singleargf)
    if labels is None and labelf:
        raise InvalidParameterError(
            "matching labels and label function", "label function specified for unlabeled data"
        )

    # process data
    # drop the incoming index so rows are numbered 0..n-1; this lets the
    # column-wise concat below align data and labels row by row
    data = data.reset_index(drop=True)

    # if labels are given as separate DataFrame, join them
    if isinstance(labels, pd.DataFrame):
        if len(data) != len(labels):
            raise InvalidParameterError(
                "matching data and labels",
                f"different number of rows ({len(data)} != {len(labels)})",
            )
        labels = labels.reset_index(drop=True)

        # column names must be unique across data and labels together
        col_names = np.hstack((data.columns.values, labels.columns.values))
        if len(col_names) != len(np.unique(col_names)):
            raise InvalidParameterError(
                "unique column names", f"{data.columns.values} and {labels.columns.values}"
            )

        data = pd.concat([data, labels], axis=1)
        # from here on, labels is a collection of column names in both cases
        labels = labels.columns.values

    # 1. optional joining
    if join:
        # group rows by the join column (keeping first-seen group order) and
        # collect every other column's values into one list per group
        groups = data.groupby(join, sort=False, as_index=False)
        data = groups.aggregate(lambda tdf: tdf.tolist())

    # 2. optional filtering
    if filterf:
        # filterf sees each row, including label columns, which are still part of data
        selection = data.apply(filterf, axis=1)
        data = data[selection]

    # split data and labels
    if labels is not None:
        # DataFrame column indexing requires list, not tuple
        data, labels = data.drop(columns=list(labels)), data[list(labels)]

    # 3. optional sample and label transform
    if samplef:
        data = data.apply(samplef, axis=1, result_type="reduce")
        # a reduced result is a Series; wrap it as a single-column DataFrame
        if isinstance(data, pd.Series):
            data = pd.DataFrame(data, columns=["Samples"])
    if labelf:
        labels = labels.apply(labelf, axis=1, result_type="reduce")
        if isinstance(labels, pd.Series):
            labels = pd.DataFrame(labels, columns=["Labels"])

    # convert to NumPy structured array
    data = self._to_numpy(data, dtype=dtype)
    labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

    super().__init__(data=data, labels=labels, **kwargs)