def validate_data_interface(ds: smlb.Data) -> bool:
    """Verify that `ds` satisfies the smlb.Data interface contract.

    Runs generic checks that every Data-compliant class should pass.

    Returns:
        True if all checks pass.

    Raises:
        AssertionError: if any check fails.
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    # infinite datasets have no further generic checks
    is_finite = ds.num_samples != float("inf")

    if is_finite:
        # size must be a non-negative, integer-representable number
        assert int(ds.num_samples) == ds.num_samples
        assert ds.num_samples >= 0

        # samples() must return every sample
        assert len(ds.samples()) == ds.num_samples

        # subset behaviour
        assert ds.subset([]).num_samples == 0
        assert ds.subset().num_samples <= ds.num_samples
        assert ds.subset(duplicates=True).num_samples == ds.num_samples

        # self-intersection never grows the set
        assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
        # assert smlb.intersection(ds, ds, duplicates=True).num_samples == ds.num_samples  # todo: support this as well

        # self-complement is empty
        assert smlb.complement(ds, ds).num_samples == 0
        # assert smlb.complement(ds, ds, duplicates=True).num_samples == 0  # todo: support this as well

        if ds.is_labeled:
            # labels() must return one label per sample
            assert len(ds.labels()) == ds.num_samples

            # labeledness is preserved by subset ...
            assert ds.subset([]).is_labeled
            assert ds.subset().is_labeled

            # ... by intersection ...
            assert smlb.intersection(ds, ds).is_labeled
            # assert smlb.intersection(ds, ds, duplicates=True).is_labeled  # todo: support this as well

            # ... and by complement
            assert smlb.complement(ds, ds).is_labeled
            # assert smlb.complement(ds, ds, duplicates=True).is_labeled  # todo: support this as well

    return True
def test_TabularData_set_intersection_correctness(get_matrix_data):
    """Correctness of set intersection for TabularData, no duplicates."""

    # unlabeled data
    a = get_matrix_data(20).subset([0, 1, 2, 3])
    b = get_matrix_data(20).subset([4, 5, 6])
    result = smlb.intersection(a, b)
    assert (result.samples() == [[1, 2, 3.3], [7, 8, 9.9]]).all()  # order

    # labeled data, scalar labels
    a = get_matrix_data(21).subset([0, 1, 2, 3])
    b = get_matrix_data(21).subset([4, 5, 6])
    result = smlb.intersection(a, b)
    assert (result.samples() == [[7, 8, 9.9]]).all()
    assert (result.labels() == [3]).all()

    # labeled data, vector labels
    a = get_matrix_data(22).subset([0, 1, 2, 3])
    b = get_matrix_data(22).subset([4, 5, 6])
    result = smlb.intersection(a, b)
    assert (result.samples() == [[7, 8, 9.9]]).all()
    assert (result.labels() == [[3, 33]]).all()

    # mixed-type samples, unlabeled
    a = get_matrix_data(23).subset([0, 1, 2, 3])
    b = get_matrix_data(23).subset([4, 5, 6])
    result = smlb.intersection(a, b)
    dt = a._data.dtype  # structured dtype of the underlying array
    assert (result.samples() == np.array([(1, "b", 3.3), (7, "h", 9.9)], dtype=dt)).all()

    # mixed-type samples, labeled
    a = get_matrix_data(24).subset([0, 1, 2, 3])
    b = get_matrix_data(24).subset([4, 5, 6])
    result = smlb.intersection(a, b)
    dt_samples, dt_labels = a._data.dtype, a._labels.dtype
    assert (result.samples() == np.array([(7, "h", 9.9)], dtype=dt_samples)).all()
    assert (result.labels() == np.array([("c", 33)], dtype=dt_labels)).all()
def run(self):
    """Execute workflow.

    Steps:
    1. Sample validation data; for finite datasets, remove it from the pool.
    2. Sample training sets from the remaining data and verify that each is
       disjoint from the validation set.
    3. Featurize validation and training sets.
    4. Train every learner on every training set and predict the validation set.
    5. Compute the evaluation metric for each run and render each evaluation.

    Raises:
        BenchmarkError: if any training set overlaps the validation set.
    """

    nlearn, ntrain = len(self._learners), len(self._training)
    ntotal = nlearn * ntrain
    self._progressf(0, ntotal)

    # 1) Validation data
    # sample validation data from dataset
    validation_data = self._validation.fit(self._data).apply(self._data)
    # remove validation data from dataset for finite datasets
    if self._data.is_finite:
        remaining_data = complement(self._data, validation_data)
    else:  # infinite
        # any finite subset has measure zero
        remaining_data = self._data

    # 2) Training sets
    # sample training sets from remaining dataset
    training_data = tuple(
        sampler.fit(remaining_data).apply(remaining_data) for sampler in self._training
    )
    # verify that the intersection between validation and all training sets is empty
    for train in training_data:
        # this assumes that both validation and training set are finite
        inters = intersection(train, validation_data)
        if inters.num_samples > 0:
            i, j, k = inters.num_samples, validation_data.num_samples, train.num_samples
            msg = f"Non-empty intersection between validation and training data ({i} shared samples out of {j} and {k})"
            raise BenchmarkError(msg)

    # 3) Featurization
    # featurize validation and training sets
    validation_data = self._features.fit(validation_data).apply(validation_data)
    training_data = tuple(self._features.fit(train).apply(train) for train in training_data)

    # 4) Training and prediction
    # train each learner on each training set and predict validation set
    predictions = np.empty((nlearn, ntrain), dtype=PredictiveDistribution)
    for i, learner in enumerate(self._learners):
        for j, training in enumerate(training_data):
            learner.fit(training)
            predictions[i, j] = learner.apply(validation_data)
            self._progressf(i * ntrain + j + 1, ntotal)  # 1-based

    # 5) Evaluate results
    # compute evaluation metric for each run
    # np.asarray(..., dtype=float) replaces np.asfarray, which was removed in NumPy 2.0
    metric = np.asarray(
        [
            [
                self._metric.evaluate(true=validation_data.labels(), pred=predictions[i, j])
                for j in range(ntrain)
            ]
            for i in range(nlearn)
        ],
        dtype=float,
    )

    # render each evaluation
    eval_data = [
        [(train.num_samples, (metric[i, j],)) for j, train in enumerate(training_data)]
        for i in range(nlearn)
    ]
    for eval_ in self._evaluations:
        eval_.evaluate(eval_data)
        eval_.render()
def set_intersection_test():
    """Exercise the intersection code.

    The module-level globals ds1 and ds2 must already have been set up.
    """
    result = smlb.intersection(ds1, ds2)
    assert result.num_samples == 2