Ejemplo n.º 1
0
def validate_data_interface(ds: smlb.Data) -> bool:
    """Check that an object honours the Data interface contract.

    Runs a fixed sequence of assertions that every Data-compliant
    class should pass. Objects reporting an infinite number of samples
    are only checked for ``smlb.Data`` inheritance; finite ones are
    additionally checked for size, subset, intersection and complement
    behavior, and, if labeled, for label handling.

    Parameters:
        ds: object to check

    Returns:
        True if no check failed

    Raises:
        AssertionError: on the first check that fails
    """

    # holds for actual as well as "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # no checks are defined for infinite data yet
        return True

    # ---- finite data from here on ----

    # size is a non-negative number representable as an integer
    assert int(ds.num_samples) == ds.num_samples
    assert ds.num_samples >= 0

    # samples() returns exactly num_samples entries
    assert len(ds.samples()) == ds.num_samples

    # subsets: empty selection, default selection, selection keeping duplicates
    assert ds.subset([]).num_samples == 0
    assert ds.subset().num_samples <= ds.num_samples
    assert ds.subset(duplicates=True).num_samples == ds.num_samples

    # set operations of the dataset with itself
    # todo: also check intersection / complement with duplicates=True once supported:
    #       intersection(ds, ds, duplicates=True).num_samples == ds.num_samples
    #       complement(ds, ds, duplicates=True).num_samples == 0
    assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
    assert smlb.complement(ds, ds).num_samples == 0

    if not ds.is_labeled:
        return True

    # ---- labeled finite data from here on ----

    # labels() returns exactly num_samples entries
    assert len(ds.labels()) == ds.num_samples

    # derived datasets remain labeled
    # todo: also check the duplicates=True variants of intersection / complement
    assert ds.subset([]).is_labeled
    assert ds.subset().is_labeled
    assert smlb.intersection(ds, ds).is_labeled
    assert smlb.complement(ds, ds).is_labeled

    return True
Ejemplo n.º 2
0
    def finalize(self, data: Data) -> Data:
        """Change dataset according to registered failures and failure mode.

        The registered failure indices are first de-duplicated and sorted
        (the result is stored back in ``self.failures``). What happens next
        depends on ``self.failmode``:

        * ``"raise"``: raise if any failure was registered, else return data
        * ``"drop"``: return ``complement(data, data.subset(failures))``
        * ``"mask"``: set ``self.mask`` to True at the failed indices, return data
        * ``"index"``: extend ``self.index`` by the failed indices, return data

        Parameters:
            data: transformed Data

        Returns:
            Transformed Data after handling failures.

        Raises:
            BenchmarkError: if failures were registered in mode "raise",
                or if the failure mode is not recognized.
        """

        # set() removes duplicate indices; sorted() already returns a new list
        self.failures = sorted(set(self.failures))

        if self.failmode == "raise":
            if self.failures:
                raise BenchmarkError("DataTransformation failed for some samples")
            return data

        if self.failmode == "drop":
            # todo: duplicates?
            return complement(data, data.subset(self.failures))

        if self.failmode == "mask":
            self.mask[self.failures] = True
            return data

        if self.failmode == "index":
            self.index.extend(self.failures)
            return data

        # reaching this point means failmode holds a value none of the branches handles
        raise BenchmarkError(f"Internal error, unrecognized failure mode '{self.failmode}'")
Ejemplo n.º 3
0
    def apply(self, data: Data, **kwargs) -> Data:
        """Select a random subset of a finite dataset.

        Sample indices are drawn without replacement via ``self.random.choice``.

        Parameters:
            data: finite dataset to draw samples from
            **kwargs: accepted, but not used by this method

        Returns:
            subset of data at the randomly drawn indices

        Raises:
            InvalidParameterError: if data is not finite
        """

        data = params.instance(data, Data)
        if not data.is_finite:
            raise InvalidParameterError("finite Data", type(data).__name__)

        # the upper bound depends on the dataset, so it is validated here (see __init__)
        num_draws = params.integer(self._size, from_=0, to=data.num_samples)

        # replace=False: no index is drawn more than once
        chosen = self.random.choice(data.num_samples, size=num_draws, replace=False)
        return data.subset(chosen)
Ejemplo n.º 4
0
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random vectors.

        Vectors are drawn with ``self.random.uniform`` from a hypercube domain.
        The domain is taken from the first of these that is available:
        the sampler's own domain (``self._domain``), the domain of ``data``,
        or the unit hypercube [0, 1] in every dimension.

        Parameters:
            data: Data to draw from; provides the dimensionality and,
                if the sampler has no domain of its own, the domain
            **kwargs: accepted, but not used by this method

        Returns:
            result of ``data.subset(vectors)``, where vectors is an array with
            ``self._size`` rows and ``data.dimensions`` columns

        Raises:
            BenchmarkError: if any bound of the domain is infinite or NaN
        """

        data = params.instance(data, Data)

        if self._domain is not None:
            # checks dimensionality (see __init__)
            domain = params.hypercube_domain(self._domain, dimensions=data.dimensions)
        elif data.domain is not None:
            domain = data.domain
        else:
            domain = [[0, 1]] * data.dimensions
        # column slicing below needs an ndarray; this is a no-op if domain already is one
        domain = np.asarray(domain)

        # Reject every non-finite bound, not only low == -inf or high == +inf:
        # an infinite bound of the opposite sign or a NaN would silently
        # produce NaN / infinite samples in the arithmetic below.
        if not np.all(np.isfinite(domain)):
            raise BenchmarkError("can not sample from infinite domain")

        lower, upper = domain[:, 0], domain[:, 1]

        # scale and shift unit-interval samples for all dimensions at once;
        # this avoids a python loop over dimensions for efficiency in high dimensions
        vectors = self.random.uniform(size=(self._size, data.dimensions)) * (upper - lower) + lower

        return data.subset(vectors)