def test_whole_and_train_separete(self): """ Applying a preprocessor before spliting data into train and test and applying is just on train data should yield the same transformation of the test data. """ for proc in PREPROCESSORS_INDEPENDENT_SAMPLES: data = preprocessor_data(proc) _, test1 = separate_learn_test(proc(data)) train, test = separate_learn_test(data) train = proc(train) test_transformed = test.transform(train.domain) np.testing.assert_almost_equal(test_transformed.X, test1.X, err_msg="Preprocessor " + str(proc))
def test_slightly_different_domain(self): """ If test data has a slightly different domain then (with interpolation) we should obtain a similar classification score. """ # rows full of unknowns make LogisticRegression undefined # we can obtain them, for example, with EMSC, if one of the badspectra # is a spectrum from the data learner = LogisticRegressionLearner(max_iter=1000, preprocessors=[_RemoveNaNRows()]) for proc in PREPROCESSORS: if hasattr(proc, "skip_add_zeros"): continue # LR that can not handle unknown values train, test = separate_learn_test(preprocessor_data(proc)) train1 = proc(train) aucorig = AUC(TestOnTestData()(train1, test, [learner])) test = slightly_change_wavenumbers(test, 0.00001) test = odd_attr(test) # a subset of points for training so that all test sets points # are within the train set points, which gives no unknowns train = Interpolate(points=getx(train)[1:-3])( train) # interpolatable train train = proc(train) # explicit domain conversion test to catch exceptions that would # otherwise be silently handled in TestOnTestData _ = test.transform(train.domain) aucnow = AUC(TestOnTestData()(train, test, [learner])) self.assertAlmostEqual(aucnow, aucorig, delta=0.03, msg="Preprocessor " + str(proc)) test = Interpolate(points=getx(test) - 1.)(test) # also do a shift _ = test.transform(train.domain) # explicit call again aucnow = AUC(TestOnTestData()(train, test, [learner])) # the difference should be slight self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc))