Example 1
import smlb


def validate_data_interface(ds: smlb.Data) -> bool:
    """Tests for compliance with Data interface.

    Runs tests that every Data-compliant class should satisfy.

    Returns:
        True

    Raises:
        AssertionError: if any test fails
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # infinite data tests
        pass

    else:
        # finite data tests

        # integer-representable non-negative size
        assert int(ds.num_samples) == ds.num_samples
        assert ds.num_samples >= 0

        # all samples are returned
        assert len(ds.samples()) == ds.num_samples

        # subsets
        assert ds.subset([]).num_samples == 0
        assert ds.subset().num_samples <= ds.num_samples
        assert ds.subset(duplicates=True).num_samples == ds.num_samples

        # intersection with self
        assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
        # assert smlb.intersection(ds, ds, duplicates=True).num_samples == ds.num_samples  # todo: support this as well

        # complement with self
        assert smlb.complement(ds, ds).num_samples == 0
        # assert smlb.complement(ds, ds, duplicates=True).num_samples == 0  # todo: support this as well

        if ds.is_labeled:
            # all labels are returned
            assert len(ds.labels()) == ds.num_samples

            # subsets
            assert ds.subset([]).is_labeled
            assert ds.subset().is_labeled

            # intersection
            assert smlb.intersection(ds, ds).is_labeled
            # assert smlb.intersection(ds, ds, duplicates=True).is_labeled  # todo: support this as well

            # complement
            assert smlb.complement(ds, ds).is_labeled
            # assert smlb.complement(ds, ds, duplicates=True).is_labeled  # todo: support this as well

    return True
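
A minimal usage sketch (not part of the original example); it assumes smlb.TabularData accepts a 2-D NumPy array via a data= argument, which may differ from the actual constructor:

import numpy as np
import smlb

# hypothetical: wrap a small matrix in a concrete Data implementation and validate it
ds = smlb.TabularData(data=np.asarray([[1, 2], [3, 4], [5, 6]]))  # constructor form is an assumption
assert validate_data_interface(ds)  # returns True or raises AssertionError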
Example 2
import numpy as np
import smlb


def test_TabularData_set_intersection_correctness(get_matrix_data):
    """Tests for correctness of intersection for TabularData without duplicates."""

    # unlabeled

    lhs = get_matrix_data(20).subset([0, 1, 2, 3])
    rhs = get_matrix_data(20).subset([4, 5, 6])
    intersection = smlb.intersection(lhs, rhs)
    assert (intersection.samples() == [[1, 2, 3.3], [7, 8, 9.9]]).all()  # sample order is preserved

    # labeled (scalars)

    lhs = get_matrix_data(21).subset([0, 1, 2, 3])
    rhs = get_matrix_data(21).subset([4, 5, 6])
    intersection = smlb.intersection(lhs, rhs)
    assert (intersection.samples() == [[7, 8, 9.9]]).all()
    assert (intersection.labels() == [3]).all()

    # labeled (vectors)

    lhs = get_matrix_data(22).subset([0, 1, 2, 3])
    rhs = get_matrix_data(22).subset([4, 5, 6])
    intersection = smlb.intersection(lhs, rhs)
    assert (intersection.samples() == [[7, 8, 9.9]]).all()
    assert (intersection.labels() == [[3, 33]]).all()

    # mixed unlabeled

    lhs = get_matrix_data(23).subset([0, 1, 2, 3])
    rhs = get_matrix_data(23).subset([4, 5, 6])
    intersection = smlb.intersection(lhs, rhs)
    dt = lhs._data.dtype
    assert (intersection.samples() == np.array([(1, "b", 3.3), (7, "h", 9.9)], dtype=dt)).all()

    # mixed labeled

    lhs = get_matrix_data(24).subset([0, 1, 2, 3])
    rhs = get_matrix_data(24).subset([4, 5, 6])
    intersection = smlb.intersection(lhs, rhs)
    dt1, dt2 = lhs._data.dtype, lhs._labels.dtype
    assert (intersection.samples() == np.array([(7, "h", 9.9)], dtype=dt1)).all()
    assert (intersection.labels() == np.array([("c", 33)], dtype=dt2)).all()
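
The "mixed" cases above compare NumPy structured arrays, which is why the expected values are built with an explicit dtype taken from lhs._data and lhs._labels; a plain nested list cannot hold an integer, a string and a float per row. A standalone NumPy sketch of the same comparison pattern (independent of smlb):

import numpy as np

# structured dtype with an integer, a one-character string and a float field
dt = np.dtype([("a", int), ("b", "U1"), ("c", float)])
rows = np.array([(1, "b", 3.3), (7, "h", 9.9)], dtype=dt)

# element-wise comparison of two structured arrays with the same dtype yields a boolean array
assert (rows == np.array([(1, "b", 3.3), (7, "h", 9.9)], dtype=dt)).all()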
Example 3
    def run(self):
        """Execute workflow."""

        nlearn, ntrain = len(self._learners), len(self._training)
        ntotal = nlearn * ntrain
        self._progressf(0, ntotal)

        # 1) Validation data

        # sample validation data from dataset
        validation_data = self._validation.fit(self._data).apply(self._data)

        # remove validation data from dataset for finite datasets
        if self._data.is_finite:
            remaining_data = complement(self._data, validation_data)
        else:  # infinite
            # any finite subset has measure zero
            remaining_data = self._data

        # 2) Training sets

        # sample training sets from remaining dataset
        training_data = tuple(
            sampler.fit(remaining_data).apply(remaining_data) for sampler in self._training
        )

        # verify that the intersection between validation and all training sets is empty
        for train in training_data:
            # this assumes that both validation and training set are finite
            inters = intersection(train, validation_data)
            if inters.num_samples > 0:
                i, j, k = inters.num_samples, validation_data.num_samples, train.num_samples
                msg = f"Non-empty intersection between validation and training data ({i} shared samples out of {j} and {k})"
                raise BenchmarkError(msg)

        # 3) Featurization

        # featurize validation and training sets
        validation_data = self._features.fit(validation_data).apply(validation_data)
        training_data = tuple(self._features.fit(train).apply(train) for train in training_data)

        # 4) Training and prediction

        # train each learner on each training set and predict validation set
        predictions = np.empty((nlearn, ntrain), dtype=PredictiveDistribution)
        for i, learner in enumerate(self._learners):
            for j, training in enumerate(training_data):
                learner.fit(training)
                predictions[i, j] = learner.apply(validation_data)

                self._progressf(i * ntrain + j + 1, ntotal)  # 1-based

        # 5) Evaluate results

        # compute evaluation metric for each run
        metric = np.asfarray(
            [
                [
                    self._metric.evaluate(true=validation_data.labels(), pred=predictions[i, j])
                    for j in range(ntrain)
                ]
                for i in range(nlearn)
            ]
        )

        # render each evaluation
        eval_data = [
            [(train.num_samples, (metric[i, j],)) for j, train in enumerate(training_data)]
            for i, learner in enumerate(self._learners)
        ]
        for eval_ in self._evaluations:
            eval_.evaluate(eval_data)
            eval_.render()
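
In step 4 above, self._progressf only needs to be a callable taking (done, total); it is called once with (0, ntotal) before training starts and then after each learner/training-set combination. A minimal sketch of such a callback (the name print_progress is hypothetical):

def print_progress(done: int, total: int) -> None:
    """Report how many of the learner/training-set runs have finished."""
    print(f"{done}/{total} runs completed")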
Example 4
    def set_intersection_test():
        """Run intersection code. Requires ds1, ds2 to have been set up."""

        intersection = smlb.intersection(ds1, ds2)
        assert intersection.num_samples == 2
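
One possible setup for ds1 and ds2 that makes the assertion hold: two datasets whose rows overlap in exactly two samples. The use of smlb.TabularData and its data= argument is an assumption about the surrounding fixture, not part of the snippet above:

import numpy as np
import smlb

# hypothetical setup in the enclosing test: rows [3, 4] and [5, 6] occur in both datasets,
# so smlb.intersection(ds1, ds2) contains exactly two samples
ds1 = smlb.TabularData(data=np.asarray([[1, 2], [3, 4], [5, 6]]))
ds2 = smlb.TabularData(data=np.asarray([[3, 4], [5, 6], [7, 8]]))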