Exemple #1
0
    def __init__(self, class_path: Optional[str] = None):
        """Set up the Java gateway shared by all instances of the class.

        The gateway is kept on the class rather than on the instance.
        The first instantiation of a derived class starts the JVM and
        creates the gateway. Later instantiations reuse it as long as
        they pass the same class_path; a different class_path shuts the
        JVM down and restarts it with the new class path.

        Parameters:
            class_path: local filesystem class path containing one
                or more directories or .jar files. If not specified,
                an empty string is passed as classpath to the JVM.

        Raises:
            BenchmarkError if the class_path is invalid.
        """

        # todo: class_path = params.optional_(class_path, params.string)
        class_path = params.any_(class_path, params.string, params.none)

        owner = self.__class__
        running = owner._gateway is not None

        if running:
            if owner._class_path != class_path:
                # parameters changed: the running JVM has to be replaced
                self._shutdown_gateway()
            else:
                # same parameters: keep using the existing gateway
                return

        # reached on the first instantiation and after a shutdown
        self._launch_gateway(class_path=class_path)
Exemple #2
0
 def _indices_testf(self, indices: Sequence[Any]):
     """Validate an optional collection of sample indices.

     A given collection must be either empty or consist solely of
     integers in the range [0, self.num_samples). A collection that
     passes validation is converted to a list.

     Parameters:
         indices: the indices to validate; handed to params.optional_,
             so presumably None is accepted as well (confirm against
             params.optional_)

     Returns:
         whatever params.optional_ returns for the given argument
     """

     def valid_index(arg):
         # a single index must address an existing sample
         return params.integer(arg, from_=0, below=self.num_samples)

     def no_indices(arg):
         # empty set
         return params.tuple_(arg, None, arity=0)

     def some_indices(arg):
         return params.tuple_(arg, valid_index)

     def as_list(arg):
         # NumPy indexing expects a list
         return list(params.any_(arg, no_indices, some_indices))

     return params.optional_(indices, as_list)
Exemple #3
0
    def noise(self, shape=None):
        """Draw Gaussian noise of the requested shape.

        Samples are drawn via self.random.normal using self._mean and
        self._stddev.

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor;
                either a positive integer or a tuple of positive integers

        Returns:
            a numerical array of given shape containing independent
            identically distributed Gaussian noise

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # NOTE(review): the default shape=None goes through the same validation
        # as any other value; whether it is accepted is not visible here.

        def positive_int(arg):
            return params.integer(arg, from_=1)

        def positive_int_tuple(arg):
            return params.tuple_(arg, positive_int)

        # a valid shape is a single positive integer or a tuple of them
        size = params.any_(shape, positive_int, positive_int_tuple)

        return self.random.normal(self._mean, self._stddev, size=size)
Exemple #4
0
    def noise(self, shape=None):
        """Return "noise" that is a constant value.

        Every entry of the returned array equals self._value.

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor;
                either a positive integer or a tuple of positive integers

        Returns:
            a numerical array of given shape containing a constant value

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # NOTE(review): the default shape=None goes through the same validation
        # as any other value; whether it is accepted is not visible here.

        def positive_int(arg):
            return params.integer(arg, from_=1)

        def positive_int_tuple(arg):
            return params.tuple_(arg, positive_int)

        # a valid shape is a single positive integer or a tuple of them
        dims = params.any_(shape, positive_int, positive_int_tuple)

        return np.full(dims, self._value)
Exemple #5
0
    def __init__(
        self,
        data: "pandas.DataFrame",  # noqa F821
        labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
        dtype: Optional[dict] = None,
        join: Optional[str] = None,
        filterf: Optional[Callable[[Any], bool]] = None,
        samplef: Optional[Callable[[Any], Any]] = None,
        labelf: Optional[Callable[[Any], Any]] = None,
        **kwargs,
    ):
        """Initialize dataset.

        Parameters control loading and preprocessing of the data. Order:
        1. joining
        2. filtering
        3. sample and label transform

        Labels passed as a separate DataFrame are first concatenated column-wise
        with the data, so that joining and filtering act on samples and labels
        together; they are split off again before step 3.

        Parameters:
            data: the samples in the form of a Pandas DataFrame.
            labels: the labels, either in the form of a Pandas DataFrame with same number of rows
                as data and different column names, or in the form of a list of column names,
                which are then split out from the data and used as labels. If not specified,
                the dataset is unlabeled.
            dtype: the NumPy data types to use for samples and labels, in the form of a dictionary
                with column names as keys and dtypes as values. Can be used to override dtype
                auto-detection for some or all columns.
            join: if specified, name of "column" to join by; this changes labels
                to be sequences of single-entry labels
            filterf: a function that accepts a sample and returns whether to keep it
                (True) or exclude it (False). Default retains all samples
            samplef: function accepting and returning a sample; applied to all samples
                as post-processing
            labelf: function accepting and returning a label; applied to all labels
                as post-processing
            kwargs: remaining keyword arguments are forwarded to the base class initializer

        Raises:
            InvalidParameterError for invalid arguments. In particular,
                numbers of data and labels must match. If column names are given,
                they must be unique across data and labels, if any. A label
                function without labels is rejected as well.
        """

        import pandas as pd  # only import if class is used

        # parameter validation
        data = params.instance(data, pd.DataFrame)
        labels = params.optional_(
            labels,
            lambda arg: params.any_(
                arg,
                lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
                lambda arg: params.tuple_(arg, params.string),
            ),
        )
        dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
        join = params.optional_(join, params.string)
        singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
        filterf = params.optional_(filterf, singleargf)
        samplef = params.optional_(samplef, singleargf)
        labelf = params.optional_(labelf, singleargf)

        # a label transform makes no sense if there is nothing to transform
        if labels is None and labelf:
            raise InvalidParameterError(
                "matching labels and label function", "label function specified for unlabeled data"
            )

        # process data
        # discard the caller's index so rows are numbered 0..n-1
        data = data.reset_index(drop=True)

        # if labels are given as separate DataFrame, join them
        if isinstance(labels, pd.DataFrame):
            if len(data) != len(labels):
                raise InvalidParameterError(
                    "matching data and labels",
                    f"different number of rows ({len(data)} != {len(labels)})",
                )

            # same fresh index as data, so that the concatenation below aligns row by row
            labels = labels.reset_index(drop=True)

            # column names must not repeat, neither within nor across data and labels
            col_names = np.hstack((data.columns.values, labels.columns.values))
            if len(col_names) != len(np.unique(col_names)):
                raise InvalidParameterError(
                    "unique column names", f"{data.columns.values} and {labels.columns.values}"
                )

            # from here on labels are represented by their column names,
            # exactly as if the caller had passed a list of names
            data = pd.concat([data, labels], axis=1)
            labels = labels.columns.values

        # 1. optional joining
        if join:
            # rows sharing a value in the join column are merged into one row
            # whose other entries are lists of the original values
            groups = data.groupby(join, sort=False, as_index=False)
            data = groups.aggregate(lambda tdf: tdf.tolist())

        # 2. optional filtering
        if filterf:
            # filterf receives one row (samples and labels) at a time
            selection = data.apply(filterf, axis=1)
            data = data[selection]

        # split data and labels
        if labels is not None:
            # DataFrame column indexing requires list, not tuple
            data, labels = data.drop(columns=list(labels)), data[list(labels)]

        # 3. optional sample and label transform
        if samplef:
            data = data.apply(samplef, axis=1, result_type="reduce")
            # a transform returning scalars yields a Series; wrap it as a one-column frame
            if isinstance(data, pd.Series):
                data = pd.DataFrame(data, columns=["Samples"])
        if labelf:
            labels = labels.apply(labelf, axis=1, result_type="reduce")
            if isinstance(labels, pd.Series):
                labels = pd.DataFrame(labels, columns=["Labels"])

        # convert to NumPy structured array
        data = self._to_numpy(data, dtype=dtype)
        labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

        super().__init__(data=data, labels=labels, **kwargs)