Example 1
    def __init__(self, failmode, num_samples: int):
        """Initialize failure handler.

        Parameters:
            failmode: how to handle failed descriptor calculations, either due to rejected SMILES
                encodings or failing descriptor code. Possible values:
                "raise" [default]: raise a Benchmarexception
                "drop": drop the sample. Returned Data will have fewer samples
                ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries will
                    be set to True for failures
                ("index", index): where `index` is an empty list to which the indices of failed
                    entries will be appended
            num_samples: number of samples that are transformed
        """

        self.num_samples = params.integer(num_samples, from_=0)
        self.failmode = self.failmode(failmode)

        if is_sequence(self.failmode) and self.failmode[0] == "mask":
            self.failmode = "mask"
            if len(failmode[1]) != self.num_samples:
                raise InvalidParameterError(
                    "failure mode mask length of {self.num_samples}",
                    len(self.mask))
            self.mask = failmode[1]
            self.mask.fill(False)

        if is_sequence(self.failmode) and self.failmode[0] == "index":
            self.failmode = "index"
            self.index = failmode[1]

        self.failures = []  # list of indices of failed samples
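
A minimal sketch of the "mask" and "index" bookkeeping described in the docstring above, using plain NumPy; the failing sample indices are assumed for illustration:

import numpy as np

# assumed scenario: samples 1 and 3 fail during descriptor calculation
num_samples, failed = 5, [1, 3]

# ("mask", mask) mode: the caller passes a boolean array, the handler clears it
# (mask.fill(False)) and marks failed entries
mask = np.zeros(num_samples, dtype=bool)
mask[failed] = True
print(mask)    # [False  True False  True False]

# ("index", index) mode: the caller passes an empty list, failed indices are appended
index = []
index.extend(failed)
print(index)   # [1, 3]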
Example 2
    def __init__(self,
                 data: np.ndarray,
                 labels: Optional[np.ndarray] = None,
                 **kwargs):
        """Initialize dataset.

        Parameters:
            data: tabular data as a NumPy ndarray
            labels: tabular data as a NumPy ndarray. If not specified,
                dataset is unlabeled.

        Raises:
            InvalidParameterError for invalid arguments. In particular,
                the numbers of data samples and labels must match.

        Examples:
            From numerical NumPy data:
            ```
            TabularData(numpy.ndarray(...), ...)
            ```

            From a Pandas DataFrame:
            ```
            df = pandas.DataFrame(..., columns=[...])
            TabularData(df.to_records(index=False), labels=...)
            ```

            From mixed NumPy data, with column names (note use of tuples):
            ```
            a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', str), ('D', int)])
            TabularData(a, ...)
            ```
        """

        # parameter validation
        data = params.instance(data, np.ndarray)
        labels = params.optional_(labels,
                                  lambda arg: params.instance(arg, np.ndarray))

        if labels is not None:
            # number of samples and labels must match
            if data.shape[0] != labels.shape[0]:
                raise InvalidParameterError(
                    "same number of samples and labels",
                    f"{data.shape[0]} samples, {labels.shape[0]} labels",
                )

            # uniqueness of "column" names, if any, is enforced by NumPy,
            # but only separately for data and labels
            if is_sequence(data.dtype.names) and is_sequence(
                    labels.dtype.names):
                column_names = data.dtype.names + labels.dtype.names
                if len(column_names) != len(np.unique(column_names)):
                    raise InvalidParameterError(
                        "unique column names for samples and labels",
                        column_names)

        self._data, self._labels = data, labels

        super().__init__(**kwargs)
Example 3
    def normal_distribution(arg):
        """Predictive normal distribution.

        Parameters:
            arg: parameter to validate; normal predictive distributions;
                 a pair of two same-length sequences is interpreted as
                 means and standard deviations of independent normal predictive distributions

        Returns:
            NormalPredictiveDistribution

        Raises:
            InvalidParameterError: if arg is invalid
        """

        # due to circular dependency
        from .distributions import NormalPredictiveDistribution

        ipe = InvalidParameterError("normal distribution", arg)

        try:
            if isinstance(arg, NormalPredictiveDistribution):
                pass
            elif (is_sequence(arg) and len(arg) == 2 and is_sequence(arg[0])
                  and is_sequence(arg[1]) and len(arg[0]) == len(arg[1])):
                # interpret as pair of two same-length sequences
                arg = NormalPredictiveDistribution(arg[0], arg[1])
            else:
                raise ipe  # check if arg is a normal distribution
        except Exception as e:
            raise ipe from e

        return arg
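
A hedged usage sketch; it assumes this validator is exposed as a static method of smlb's `params` class and that `NormalPredictiveDistribution` is importable from the package top level, as the other snippets here suggest:

from smlb import params, NormalPredictiveDistribution

# an existing distribution object passes through unchanged
d1 = params.normal_distribution(NormalPredictiveDistribution([1, 2, 3], [0.5, 0.6, 0.7]))

# a pair of same-length sequences is interpreted as (means, standard deviations)
d2 = params.normal_distribution(([1, 2, 3], [0.5, 0.6, 0.7]))

# anything else, for example a single sequence, raises InvalidParameterError
# params.normal_distribution([1, 2, 3])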
Example 4
    def failmode(failmode):
        """Failure mode.

        Validates that the argument is a valid failure mode, similar to the validators in smlb.params.
        See __init__ for valid values.
        """

        ipe = InvalidParameterError("valid failure mode specification",
                                    failmode)

        if failmode in ("raise", "drop"):
            return failmode

        if not (is_sequence(failmode) and len(failmode) == 2):
            raise ipe

        if (failmode[0] == "mask" and isinstance(failmode[1], np.ndarray)
                and failmode[1].ndim == 1
                and failmode[1].dtype.name == "bool"):
            return failmode

        if failmode[0] == "index" and isinstance(failmode[1], list) and len(
                failmode[1]) == 0:
            return failmode

        raise ipe
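
The following sketch lists, as plain data, specifications that pass and fail the checks above; the concrete values are illustrative assumptions:

import numpy as np

# specifications this validator accepts (see the branches above):
ok = [
    "raise",                                   # raise an exception on failure
    "drop",                                    # drop failed samples
    ("mask", np.zeros(7, dtype=bool)),         # one-dimensional boolean mask
    ("index", []),                             # empty list for failed indices
]

# specifications it rejects with InvalidParameterError:
bad = [
    "ignore",                                  # not a known mode
    ("mask", np.zeros((7, 1), dtype=bool)),    # mask must be one-dimensional
    ("index", [0]),                            # index list must start out empty
]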
Example 5
    def distribution(arg):
        """Predictive distribution.

        Parameters:
            arg: parameter to validate; predictive distributions;
                 a sequence is interpreted as specifying the means of a DeltaPredictiveDistribution

        Returns:
            PredictiveDistribution or subclass

        Raises:
            InvalidParameterError: if arg is invalid
        """

        # due to circular dependency
        from .distributions import PredictiveDistribution, DeltaPredictiveDistribution

        ipe = InvalidParameterError("distribution", arg)

        try:
            if isinstance(arg, PredictiveDistribution):
                pass
            elif is_sequence(arg):
                # interpret as sequence of means
                arg = np.asfarray(arg)
                if len(arg.shape) != 1:
                    raise ipe
                arg = DeltaPredictiveDistribution(arg)
            else:
                raise ipe
        except Exception as e:
            raise ipe from e

        return arg
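
A hedged counterpart to the sketch after Example 3, assuming the same `smlb.params` access path: a bare sequence is wrapped into a delta (point-mass) predictive distribution.

from smlb import params

d = params.distribution([1.0, 2.0, 3.0])     # sequence of means -> DeltaPredictiveDistribution
print(type(d).__name__)                      # DeltaPredictiveDistribution

# a nested sequence is rejected because np.asfarray(...) is not one-dimensional
# params.distribution([[1.0], [2.0]])        # raises InvalidParameterError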
Example 6
    def tuple_(arg, testf, *args, arity=None, default=NONE):
        """k-tuple meta-test.

        If arity is larger than the number of test functions provided,
        the last test function is repeatedly used. This enables
        `tuple_(..., f, arity=3)` for homogeneous-type tuples.

        Parameters:
            arg: parameter to validate as a tuple
            testf: test function that accepts a single argument and validates it;
                arbitrarily many further test functions can be passed as additional positional arguments
            arity: length of tuple
            default: if specified together with arity, too-short tuples are extended with this value

        Returns:
            arg if it is a tuple and every component is successfully validated

        Raises:
            InvalidParameterError if arg is not a sequence or one of the test functions fails
        """

        if arity is None:
            arity = max(len(args) + 1, len(arg) if is_sequence(arg) else 0)
        ipe = InvalidParameterError(
            f"{arity}-tuple with valid components (tuple_)", arg)

        if not is_sequence(arg) or len(arg) > arity:
            raise ipe

        if len(arg) < arity:
            if default != params.NONE:
                arg = arg + tuple(default for _ in range(arity - len(arg)))
            else:
                raise ipe

        try:
            testf = (testf, *args)
            return tuple(testf[i if i < len(testf) else -1](arg[i])
                         for i in range(arity))
        except InvalidParameterError as e:
            raise ipe from e
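
A hedged sketch of the arity and default behavior described above, again assuming access via `smlb.params`:

from smlb import params

# the last (here: only) test function is reused for all components
t1 = params.tuple_((1.0, 2.0, 3.0), params.real, arity=3)        # -> (1.0, 2.0, 3.0)

# with both arity and default given, too-short tuples are padded before testing
t2 = params.tuple_((1.0,), params.real, arity=3, default=0.0)    # -> (1.0, 0.0, 0.0)

# a component failing its test raises InvalidParameterError
# params.tuple_((1.0, "x"), params.real, arity=2)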
Example 7
    def _joint_data_labels(ds):
        """Single structured array for data and labels for comparison.

        Structured arrays can be used to run NumPy set methods
        on arrays with more than one dimension.
        """

        ds = params.instance(ds, TabularData)

        if is_sequence(ds._data.dtype.names):  # structured array
            lhs = ds._data
        else:  # homogeneous array, possibly many dimensions
            lhs = np.reshape(ds._data, (ds.num_samples, -1))
            lhs = lhs.view([("", ds._data.dtype)] * np.prod(lhs.shape[1:]))
            lhs = np.reshape(lhs, ds.num_samples)

        if not ds.is_labeled:
            result = lhs
        else:  # is_labeled
            # alternatives for hstack() that did not work included
            # numpy.lib.recfunctions.merge_arrays.

            if is_sequence(ds._labels.dtype.names):  # structured array
                rhs = ds._labels
            else:  # homogeneous array, possibly high-dimensional
                rhs = np.reshape(ds._labels, (ds.num_samples, -1))
                rhs = rhs.view([(str(i), rhs.dtype)
                                for i in range(np.prod(rhs.shape[1:]))])
                rhs = np.reshape(rhs, ds.num_samples)

            # lhs and rhs are structured array (views) now
            # unfortunately, np.hstack fails for these
            dtypes = lhs.dtype.descr + rhs.dtype.descr
            result = np.empty(ds.num_samples, dtype=dtypes)
            for name in lhs.dtype.names:
                result[name] = lhs[name]
            for name in rhs.dtype.names:
                result[name] = rhs[name]

        return result
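
The row-as-element trick used above can be illustrated with plain NumPy, independent of smlb: viewing a homogeneous 2-d array as a 1-d structured array lets set routines such as `np.unique` treat whole rows as single elements.

import numpy as np

a = np.array([[1, 2], [3, 4], [1, 2]])         # 3 samples, 2 columns, one duplicate row
rows = a.view([("", a.dtype)] * a.shape[1])    # structured view: one field per column
rows = np.reshape(rows, a.shape[0])            # collapse the trailing length-1 axis

print(np.unique(rows))                         # the duplicate row [1, 2] appears only once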
Example 8
File: plots.py Project: syam-s/smlb
    def render(self, target=None):
        """Renders evaluation.

        Specific derived classes should override `_render`, not this method.
        """

        target = self._target if target is None else target  # default to the configured target

        # if sequence of targets, render each of them
        if is_sequence(target) and not isinstance(target[0],
                                                  mpl.figure.Figure):
            for tgt in target:
                self.render(tgt)
            return

        # process single target
        if isinstance(target, mpl.axes.Axes):
            target = (plt.gcf(), target)

        # remember filename for export
        if isinstance(target, str):
            filename = target
            target = None
        else:
            filename = None

        # create new plot if necessary
        owner = False
        if target is None:
            owner = True
            target = plt.subplots()

        self._figax = target  # tuple(Figure, Axes)

        # set matplotlib plot settings
        # at this time, settings such as axes labels or scales contain the
        # correct values, but have not been set yet as the figure and axes
        # were just created. re-assignment sets (or 'activates') these values
        # for the new figure and axes.
        self.axes_labels = self.axes_labels
        self.axes_scales = self.axes_scales

        self._render(target)

        # export to filename if requested
        if filename is not None:
            self.fig.savefig(filename, bbox_inches="tight", pad_inches=0)

        # clean up if owner of Axes
        if owner:
            plt.close(self.fig)  # fig.clear() might not release all memory
            self._figax = None
Example 9
def test_is_sequence_examples():
    """Tests whether is_sequence complies to docstring via examples."""

    assert smlb.is_sequence([1, 2, 3]), "list"
    assert smlb.is_sequence((1, 2, 3)), "tuple"
    assert smlb.is_sequence(np.asfarray([1, 2, 3])), "array"

    assert not smlb.is_sequence("str"), "string"
    assert not smlb.is_sequence(b"bytes"), "bytes"
    assert not smlb.is_sequence(dict(a=1, b=2)), "dictionary"
    assert not smlb.is_sequence({1, 2, 3}), "set"
Example 10
    def sequence(arg, length=None, type_=None, testf=None):
        """Sequence.

        Sequence, of given length and type if specified.

        Parameters:
            arg: parameter to be validated as a sequence
            length: required length of sequence or None (default)
            type_: required type for all sequence elements or None (default)
            testf: test function applied to each element, or None (default)

        Returns:
            arg if a sequence

        Raises:
            InvalidParameterError if arg is not a sequence, of given length and type if specified
        """

        ipe_length = "" if length is None else f" of length {length}"
        ipe_type = "" if type_ is None else f" of type {type_.__name__}"
        ipe_testf = "" if testf is None else " with constraints"
        ipe = InvalidParameterError(
            f"a sequence{ipe_length}{ipe_type}{ipe_testf}", arg)

        if not is_sequence(arg):
            raise ipe
        if length is not None:
            if len(arg) != length:
                raise ipe
        if type_ is not None:
            if not all(isinstance(el, type_) for el in arg):
                raise ipe
        if testf is not None:
            try:
                for el in arg:
                    testf(el)
            except Exception as e:
                raise ipe from e

        return arg
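
A hedged usage sketch, assuming the validator is exposed as `smlb.params.sequence`:

from smlb import params

params.sequence([1, 2, 3], length=3, type_=int)    # returns [1, 2, 3]
params.sequence((0.5, 1.5), testf=params.real)     # per-element test function
# params.sequence("abc")                           # raises: strings are not sequences here
# params.sequence([1, 2], length=3)                # raises: wrong length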
Example 11
def test_orient():
    """Test orient argument for oriented metrics."""

    classes = (
        smlb.MeanAbsoluteError,
        smlb.MeanSquaredError,
        smlb.RootMeanSquaredError,
        smlb.MeanLogPredictiveDensity,
        smlb.MeanContinuousRankedProbabilityScore,
    )

    true = smlb.NormalPredictiveDistribution([1, 2, 3], [0.5, 0.6, 0.7])
    pred = smlb.NormalPredictiveDistribution([1.1, 2.2, 2.9], [0.4, 0.7, 0.65])
    for c in classes:
        resa, resb = c(orient=-1)(true, pred), c(orient=+1)(true, pred)
        if smlb.is_sequence(resa):
            assert (resa == -resb).all(), c.__name__
        else:
            assert resa == -resb, c.__name__

        with pytest.raises(Exception):
            c(orientt=-1)  # ensure misspelt argument raises
Example 12
    def evaluate(self, results, **kwargs):
        """Compute plot data for multiple generalized (set-valued) functions.

        Multiple curves C_1, ..., C_k can be drawn.
        Each curve C_i is specified by a non-empty sequence of 2-tuples,
        where the first value is the location on the horizontal axis and the
        second value is a sequence of locations on the vertical axis.

        Each curve can be drawn in a different way (points, box-whisker).

        Parameters:
            results: sequence of generalized functions data (curve data).
                     Each datum is a sequence of tuples (x,fx), where
                     x is a real number and fx is a sequence of real numbers.

        Examples:
            # two curves sharing one horizontal location
            evaluate([
                [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
                [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))], # curve 2
            ])
        """

        super().evaluate(results=results, **kwargs)

        # parameter validation

        tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        # _rectify evaluates to True if True or if > 0
        if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
            raise InvalidParameterError(
                f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(self.RECTIFY_DELTAS)} curves"
            )

        # finalize parameter validation for visualization_type
        if not is_sequence(self._visualization_type):
            self._visualization_type = (self._visualization_type,) * len(results)
        self._visualization_type = params.tuple_(
            self._visualization_type,
            lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
            arity=len(results),
            default="points",
        )

        # prepare plot

        # determine all distinct horizontal positions in the results data
        all_positions = np.unique([entry[0] for curve in results for entry in curve])

        # there is nothing to do without data to plot
        if len(all_positions) == 0:
            self._plotdata = []
            return

        # do not rectify if there is only a single horizontal position
        if len(all_positions) == 1 or self._rectify is False:
            self._rectify = 0.0

        # automatic determination of horizontal rectification factor
        #
        # the correct way to draw box-plots on a logarithmic horizontal axis is to have
        # different left-width and right-width of the boxes. However, matplotlib does not
        # support this. Because box widths are small compared to horizontal plot range,
        # it suffices to use the sum of left- and right-half widths.
        between_groups_spacing = 0.4
        in_group_spacing = 0.9  # box-whisker plots
        if self.axes_scales[0] == "linear":
            logf = lambda arg: arg
            powf = lambda arg: arg
        elif self.axes_scales[0] == "log":
            base = 10
            logf = lambda arg: np.log(arg) / np.log(base)
            powf = lambda arg: np.power(base, arg)

        if self._rectify is True:
            # diff(...) requires at least two horizontal locations; this is ensured above
            self._rectify = (
                between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
            )

        # determine positions
        self._plotdata = [None] * len(results)
        deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
        for (i, curve) in enumerate(results):
            # point markers, every single point is drawn
            if self._visualization_type[i] == "points":
                positions = powf(
                    np.hstack(
                        [
                            logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                            for entry in curve
                        ]
                    )
                )
                values = np.hstack([entry[1] for entry in curve])
                self._plotdata[i] = np.transpose([positions, values])
            # box-whisker plots
            elif self._visualization_type[i] == "box-whisker":
                positions = np.asfarray(
                    [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve]
                )
                values = [entry[1] for entry in curve]
                # can't use rectify for width if 0; 1 is a wild guess
                # todo: if plot ranges have been set, a better default value could
                #       be 10% of horizontal plot range
                w = 1 if not self._rectify else self._rectify
                widths = powf((positions + w / 2) * in_group_spacing) - powf(
                    (positions - w / 2) * in_group_spacing
                )
                positions = powf(positions)
                self._plotdata[i] = (positions, values, widths)
            elif self._visualization_type[i] == "shaded-line":
                positions = np.asfarray([entry[0] for entry in curve])
                values = [entry[1] for entry in curve]
                self._plotdata[i] = (positions, values)
            else:
                raise BenchmarkError("internal error, unknown visualization type")
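
The log-axis handling above can be sketched in isolation with plain NumPy: positions are rectified and widths computed in log space, then mapped back to the data scale with the matching power function. This simplified sketch omits the `in_group_spacing` factor.

import numpy as np

base = 10
logf = lambda x: np.log(x) / np.log(base)
powf = lambda x: np.power(base, x)

x = np.asarray([1.0, 10.0, 100.0])               # horizontal locations on a log-scaled axis
pos = logf(x)                                    # work in log space: [0, 1, 2]
w = 0.2                                          # total box width budget in log space
widths = powf(pos + w / 2) - powf(pos - w / 2)   # left- and right-half widths summed
centers = powf(pos)                              # centers back on the data scale

print(centers)                                   # [  1.  10. 100.]
print(widths)                                    # widths grow with position on a log axis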