def test_slightly_different_domain(self):
    """
    A slightly changed test domain should (thanks to interpolation)
    give a classification score close to the original one.
    """
    learner = LogisticRegressionLearner(preprocessors=[])

    def auc_for(learn_data, test_data):
        # AUC of `learner` trained on learn_data and scored on test_data
        return AUC(TestOnTestData(learn_data, test_data, [learner]))

    for proc in PREPROCESSORS:
        # LR that can not handle unknown values
        train, test = separate_learn_test(self.collagen)
        reference_auc = auc_for(proc(train), test)

        test = odd_attr(destroy_atts_conversion(test))

        # train only on an inner subset of points so that all test set
        # points lie within the train set points, which gives no unknowns;
        # Interpolate also makes train capable of interpolation
        inner_points = getx(train)[1:-3]
        train = proc(Interpolate(points=inner_points)(train))

        # explicit domain conversion to catch exceptions that would
        # otherwise be silently handled in TestOnTestData
        Orange.data.Table(train.domain, test)
        self.assertAlmostEqual(auc_for(train, test), reference_auc,
                               delta=0.02)

        # also do a shift
        shifted_points = getx(test) - 1.
        test = Interpolate(points=shifted_points)(test)
        Orange.data.Table(train.domain, test)  # explicit call again
        # the difference should be slight
        self.assertAlmostEqual(auc_for(train, test), reference_auc,
                               delta=0.05)
def commit(self):
    """
    Recompute the interpolated table and send it to the output.

    Depending on ``self.input_radio`` the data is interpolated at
    its own points (0), on a regular grid built from ``xmin``, ``xmax``
    and ``dx`` (1), or with the stored ``data_points_interpolate``
    callable (2). ``None`` is sent when nothing could be computed.
    """
    self.Error.dxzero.clear()
    self.Error.too_many_points.clear()

    result = None
    if not self.data:
        pass  # nothing to interpolate; None is sent below
    elif self.input_radio == 0:
        own_points = getx(self.data)
        result = Interpolate(own_points)(self.data)
    elif self.input_radio == 1:
        xs = getx(self.data)
        if self.dx > 0:
            # missing limits default to the extent of the data
            xmin = np.min(xs) if self.xmin is None else self.xmin
            xmax = np.max(xs) if self.xmax is None else self.xmax
            xmin, xmax = min(xmin, xmax), max(xmin, xmax)
            reslength = abs(math.ceil((xmax - xmin) / self.dx))
            if reslength >= 10002:
                self.Error.too_many_points(reslength)
            else:
                grid = np.arange(xmin, xmax, self.dx)
                result = Interpolate(grid)(self.data)
        else:
            self.Error.dxzero()
    elif self.input_radio == 2 and self.data_points_interpolate is not None:
        result = self.data_points_interpolate(self.data)

    self.Outputs.interpolated_data.send(result)
def test_unknown_elsewhere_different(self):
    """
    Compare the interpolation back-ends on data with scattered unknowns.

    Both NaN-aware back-ends must fill the unknowns with the same values,
    while the back-end without NaN handling leaves NaNs in the output but
    must agree on the rows that contain no unknowns.
    """
    data = Orange.data.Table("iris")
    with data.unlocked():
        data.X[0, 1] = np.nan
        data.X[1, 1] = np.nan
        data.X[1, 2] = np.nan
    im = Interpolate(getx(data))

    # the same expectations hold for both NaN-aware implementations
    for interpfn in (interp1d_with_unknowns_numpy,
                     interp1d_with_unknowns_scipy):
        im.interpfn = interpfn
        interpolated = im(data)
        self.assertAlmostEqual(interpolated.X[0, 1], 3.25)
        self.assertAlmostEqual(interpolated.X[1, 1], 3.333333333333334)
        self.assertAlmostEqual(interpolated.X[1, 2], 1.766666666666667)
        self.assertFalse(np.any(np.isnan(interpolated.X)))

    save_X = interpolated.X
    im.interpfn = interp1d_wo_unknowns_scipy
    interpolated = im(data)
    self.assertTrue(np.any(np.isnan(interpolated.X)))
    # parts without unknown should be the same
    np.testing.assert_almost_equal(data.X[2:], save_X[2:])
    # ... and that includes the result of the NaN-unaware back-end, which
    # the original check never looked at
    np.testing.assert_almost_equal(interpolated.X[2:], save_X[2:])
def test_slightly_different_domain(self):
    """
    A slightly changed test domain should (thanks to interpolation)
    give a classification score close to the original one.
    """
    # rows full of unknowns make LogisticRegression undefined
    # we can obtain them, for example, with EMSC, if one of the badspectra
    # is a spectrum from the data
    learner = LogisticRegressionLearner(preprocessors=[_RemoveNaNRows()])

    def auc_for(learn_data, test_data):
        # AUC of `learner` trained on learn_data and scored on test_data
        return AUC(TestOnTestData()(learn_data, test_data, [learner]))

    for proc in PREPROCESSORS:
        if hasattr(proc, "skip_add_zeros"):
            continue
        failure_msg = "Preprocessor " + str(proc)

        # LR that can not handle unknown values
        train, test = separate_learn_test(self.collagen)
        reference_auc = auc_for(proc(train), test)

        test = odd_attr(slightly_change_wavenumbers(test, 0.00001))

        # train only on an inner subset of points so that all test set
        # points lie within the train set points, which gives no unknowns;
        # Interpolate also makes train interpolatable
        inner_points = getx(train)[1:-3]
        train = proc(Interpolate(points=inner_points)(train))

        # explicit domain conversion to catch exceptions that would
        # otherwise be silently handled in TestOnTestData
        Orange.data.Table(train.domain, test)
        self.assertAlmostEqual(auc_for(train, test), reference_auc,
                               delta=0.02, msg=failure_msg)

        # also do a shift
        shifted_points = getx(test) - 1.
        test = Interpolate(points=shifted_points)(test)
        Orange.data.Table(train.domain, test)  # explicit call again
        # the difference should be slight
        self.assertAlmostEqual(auc_for(train, test), reference_auc,
                               delta=0.05, msg=failure_msg)
def test_predict_different_domain_interpolation(self):
    """
    Predict on shifted and progressively cut test domains with a model
    trained on interpolation-capable data.
    """
    def auc_for(learn_data, test_data):
        # a fresh learner is used for every evaluation
        learners = [LogisticRegressionLearner()]
        return AUC(TestOnTestData()(learn_data, test_data, learners))

    train, test = separate_learn_test(self.collagen)
    aucorig = auc_for(train, test)

    test = Interpolate(points=getx(test) - 1.)(test)  # other test domain
    # make train capable of interpolation
    train = Interpolate(points=getx(train))(train)
    aucshift = auc_for(train, test)
    # shift can decrease AUC slightly
    self.assertAlmostEqual(aucorig, aucshift, delta=0.01)

    # each cut is applied on top of the previous one
    cut_aucs = []
    for low, high in ((1000, 1700), (1100, 1600), (1200, 1500)):
        test = Cut(low, high)(test)
        cut_aucs.append(auc_for(train, test))
    auccut1, auccut2, auccut3 = cut_aucs

    # the more we cut the lower precision we get
    self.assertTrue(aucorig > auccut1 > auccut2 > auccut3)
def test_unknown_middle(self):
    """A column of unknowns in the middle of the domain gets filled in."""
    data = Orange.data.Table("iris")
    with data.unlocked():
        # the whole column in the middle should be interpolated
        data.X[:, 1] = np.nan
    interpolator = Interpolate(getx(data))
    result = interpolator(data)
    has_nan = np.any(np.isnan(result.X))
    self.assertFalse(has_nan)
def test_out_of_band(self):
    """Points outside the data's range are unknown; inner ones are kept."""
    data = Orange.data.Table("iris")
    n_attrs = len(data.domain.attributes)
    # one extra point before the first and one after the last attribute
    points = range(-1, n_attrs + 1)
    interpolated = Interpolate(points)(data)
    inner = interpolated.X[:, 1:5]
    outer = interpolated.X[:, [0, -1]]
    np.testing.assert_allclose(inner, data.X)
    np.testing.assert_equal(outer, np.nan)
def setUpClass(cls):
    """
    Load the shared test tables and build the collection of edge-case
    ("strange") data sets stored on the class.
    """
    super().setUpClass()
    cls.iris = Table("iris")
    cls.collagen = Table("collagen")
    cls.normal_data = [cls.iris, cls.collagen]
    # dataset with a single attribute
    iris1 = Table(Domain(cls.iris.domain[:1]), cls.iris)
    # dataset without any attributes
    iris0 = Table(Domain([]), cls.iris)
    # data set with no lines
    empty = cls.iris[:0]
    # dataset with large blank regions
    irisunknown = Interpolate(np.arange(20))(cls.iris)
    cls.unknown_last_instance = cls.iris.copy()
    # needs to be unknown after sampling and permutation
    cls.unknown_last_instance.X[73] = NAN
    # a data set with features with the same names
    sfdomain = Domain([ContinuousVariable("1"), ContinuousVariable("1")])
    cls.same_features = Table(sfdomain, [[0, 1]])
    # a data set with only infs
    cls.only_inf = iris1.copy()
    # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
    cls.only_inf.X *= np.inf
    cls.strange_data = [
        iris1,
        iris0,
        empty,
        irisunknown,
        cls.unknown_last_instance,
        cls.same_features,
        cls.only_inf,
    ]
def test_permute(self):
    """
    Interpolation must follow attribute names, not column order: permuted
    input columns (and permuted output points) give correspondingly
    permuted values.
    """
    rs = np.random.RandomState(0)
    data = Orange.data.Table("iris")
    oldX = data.X
    n_attrs = len(data.domain.attributes)
    try:
        # permute data
        p = rs.permutation(range(n_attrs))
        for i, a in enumerate(data.domain.attributes):
            a.name = str(p[i])
        data.X = data.X[:, p]
        interpolated = Interpolate(range(n_attrs))(data)
        np.testing.assert_allclose(interpolated.X, oldX)
        # also permute output
        p1 = rs.permutation(range(n_attrs))
        interpolated = Interpolate(p1)(data)
        np.testing.assert_allclose(interpolated.X, oldX[:, p1])
    finally:
        # the variables were renamed in place, so always reset the caches,
        # even when an assertion above fails
        Orange.data.domain.Variable._clear_all_caches()
def setUpClass(cls):
    """
    Load the shared test tables and build the collection of edge-case
    ("strange") data sets stored on the class.
    """
    super().setUpClass()
    cls.iris = Table("iris")
    cls.titanic = Table("titanic")
    cls.collagen = Table("collagen")
    cls.normal_data = [cls.iris, cls.collagen]
    # dataset with a single attribute
    iris1 = cls.iris.transform(Domain(cls.iris.domain[:1]))
    # dataset without any attributes
    iris0 = cls.iris.transform(Domain([]))
    # data set with no lines
    empty = cls.iris[:0]
    # dataset with large blank regions
    irisunknown = Interpolate(np.arange(20))(cls.iris)
    cls.unknown_last_instance = cls.iris.copy()
    with cls.unknown_last_instance.unlocked():
        # needs to be unknown after sampling and permutation
        cls.unknown_last_instance.X[73] = NAN
    # dataset with mixed unknowns
    cls.unknown_pts = cls.collagen.copy()
    with cls.unknown_pts.unlocked():
        cls.unknown_pts[5] = np.nan
        cls.unknown_pts[8:10] = np.nan
        cls.unknown_pts[15] = np.inf
    # a data set with only infs
    cls.only_inf = iris1.copy()
    with cls.only_inf.unlocked():
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
        cls.only_inf.X *= np.inf
    cls.strange_data = [
        iris1,
        iris0,
        empty,
        irisunknown,
        cls.unknown_last_instance,
        cls.only_inf,
        cls.unknown_pts,
    ]
def test_predict_savgol_another_interpolate(self):
    """
    Adding an Interpolate step after Savitzky-Golay filtering of the
    train data should leave the AUC nearly unchanged.
    """
    def auc_for(learn_data, test_data):
        learners = [LogisticRegressionLearner()]
        return AUC(TestOnTestData()(learn_data, test_data, learners))

    train, test = separate_learn_test(self.collagen)
    savgol = SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2)
    train = savgol(train)
    auc = auc_for(train, test)

    train = Interpolate(points=getx(train))(train)
    aucai = auc_for(train, test)
    self.assertAlmostEqual(auc, aucai, delta=0.02)
def test_predict_samename_domain_interpolation(self):
    """
    With interpolation-capable train data, test data passed through
    destroy_atts_conversion must score exactly as the original test data.
    """
    def auc_for(learn_data, test_data):
        learners = [LogisticRegressionLearner()]
        return AUC(TestOnTestData()(learn_data, test_data, learners))

    train, test = separate_learn_test(self.collagen)
    aucorig = auc_for(train, test)

    test = destroy_atts_conversion(test)
    # make train capable of interpolation
    train = Interpolate(points=getx(train))(train)
    auc = auc_for(train, test)
    self.assertEqual(aucorig, auc)
def test_permute(self):
    """
    Interpolation must follow attribute names, not column order: permuted
    input columns (and permuted output points) give correspondingly
    permuted values.
    """
    rs = np.random.RandomState(0)
    data = Orange.data.Table("iris")
    oldX = data.X
    n_attrs = len(data.domain.attributes)

    # permute data: new variables are named after the permuted positions
    p = rs.permutation(range(n_attrs))
    nattr = [Orange.data.ContinuousVariable(str(position)) for position in p]
    permuted_domain = Orange.data.Domain(nattr)
    data = Orange.data.Table.from_numpy(permuted_domain, X=data.X[:, p])

    n_attrs = len(data.domain.attributes)
    interpolated = Interpolate(range(n_attrs))(data)
    np.testing.assert_allclose(interpolated.X, oldX)

    # also permute output
    p1 = rs.permutation(range(n_attrs))
    interpolated = Interpolate(p1)(data)
    np.testing.assert_allclose(interpolated.X, oldX[:, p1])
def test_domain_conversion(self):
    """Check that the interpolated domain can convert the original data."""
    data = Orange.data.Table("iris")
    interpolated = Interpolate([0.5, 1.5])(data)
    # converting the raw data into the new domain must reproduce the result
    converted = Orange.data.Table.from_table(interpolated.domain, data)
    self.assertEqual(interpolated.domain, converted.domain)
    for part in ("X", "Y"):
        np.testing.assert_equal(getattr(interpolated, part),
                                getattr(converted, part))
def test_time():
    """
    Print timings of interpolation without NaN handling and with the
    numpy- and scipy-based NaN-aware implementations on several files.
    """
    fns = ["collagen", dust(), spectra20nea(), "peach_juice.dpt"]
    for fn in fns:
        print(fn)
        data = Table(fn)
        print(data.X.shape)
        data[0, 2] = np.nan

        t = time.time()
        interpolated = Interpolate(getx(data), handle_nans=False)(data)
        print("no nan", time.time() - t)

        t = time.time()
        intp = Interpolate(getx(data), handle_nans=True)
        intp.interpfn = interp1d_with_unknowns_numpy
        interpolated = intp(data)
        print("nan handling with numpy", time.time() - t)

        # restart the clock: otherwise the scipy figure would also contain
        # the time spent in the numpy run above
        t = time.time()
        intp.interpfn = interp1d_with_unknowns_scipy
        interpolated = intp(data)
        print("nan handling with scipy", time.time() - t)
        assert not np.any(np.isnan(interpolated.X))
def test_predict_different_domain(self):
    """
    Predicting on a shifted test domain, without interpolation-capable
    train data, must fail (or, on old Orange, give a near-random AUC).
    """
    train, test = separate_learn_test(self.collagen)
    test = Interpolate(points=getx(test) - 1)(test)  # other test domain
    # Only the import is guarded, so an ImportError raised while training
    # or predicting is not mistaken for "old Orange".
    try:
        from Orange.data.table import DomainTransformationError
    except ImportError:  # until Orange 3.19
        aucdestroyed = AUC(TestOnTestData(train, test,
                                          [LogisticRegressionLearner()]))
        self.assertTrue(0.45 < aucdestroyed < 0.55)
    else:
        with self.assertRaises(DomainTransformationError):
            LogisticRegressionLearner()(train)(test)
def test_floatname(self):
    """
    Interpolating halfway between two neighbouring float-named
    attributes yields the mean of their two columns.
    """
    data = Orange.data.Table("collagen.csv")
    left, right = 20, 21
    attributes = data.domain.attributes
    x_left = float(attributes[left].name)
    x_right = float(attributes[right].name)
    midpoint = (x_left + x_right) / 2
    interpolated = Interpolate([midpoint])(data)
    actual = interpolated.X.ravel()
    expected = data.X[:, [left, right]].mean(axis=1)
    np.testing.assert_allclose(actual, expected)
def test_unknown_elsewhere(self):
    """Scattered unknowns are replaced by interpolated values."""
    # cell index -> value expected after interpolation
    expected = {
        (0, 1): 3.25,
        (1, 1): 3.333333333333334,
        (1, 2): 1.766666666666667,
    }
    data = Orange.data.Table("iris")
    for cell in expected:
        data.X[cell] = np.nan
    im = Interpolate(getx(data))
    interpolated = im(data)
    for cell, value in expected.items():
        self.assertAlmostEqual(interpolated.X[cell], value)
    self.assertFalse(np.any(np.isnan(interpolated.X)))
def test_unknown_elsewhere_different(self):
    """
    Compare the interpolation back-ends on data with scattered unknowns.

    Both NaN-aware back-ends must fill the unknowns with the same values,
    while the back-end without NaN handling leaves NaNs in the output but
    must agree on the rows that contain no unknowns.
    """
    data = Orange.data.Table("iris")
    data.X[0, 1] = np.nan
    data.X[1, 1] = np.nan
    data.X[1, 2] = np.nan
    im = Interpolate(getx(data))

    # the same expectations hold for both NaN-aware implementations
    for interpfn in (interp1d_with_unknowns_numpy,
                     interp1d_with_unknowns_scipy):
        im.interpfn = interpfn
        interpolated = im(data)
        self.assertAlmostEqual(interpolated.X[0, 1], 3.25)
        self.assertAlmostEqual(interpolated.X[1, 1], 3.333333333333334)
        self.assertAlmostEqual(interpolated.X[1, 2], 1.766666666666667)
        self.assertFalse(np.any(np.isnan(interpolated.X)))

    save_X = interpolated.X
    im.interpfn = interp1d_wo_unknowns_scipy
    interpolated = im(data)
    self.assertTrue(np.any(np.isnan(interpolated.X)))
    # parts without unknown should be the same
    np.testing.assert_almost_equal(data.X[2:], save_X[2:])
    # ... and that includes the result of the NaN-unaware back-end, which
    # the original check never looked at
    np.testing.assert_almost_equal(interpolated.X[2:], save_X[2:])
def setUpClass(cls):
    """Load the shared tables and build the list of edge-case inputs."""
    super().setUpClass()
    cls.iris = Orange.data.Table("iris")
    cls.whitelight = Orange.data.Table("whitelight.gsf")

    # copy of whitelight whose first row has an unknown "value"
    cls.whitelight_unknown = cls.whitelight.copy()
    cls.whitelight_unknown[0]["value"] = NAN

    # dataset with a single attribute
    single_attr_domain = Orange.data.Domain(cls.iris.domain[:1])
    cls.iris1 = Orange.data.Table(single_attr_domain, cls.iris)

    # dataset without any attributes
    no_attr_domain = Orange.data.Domain([])
    iris_no_attrs = Orange.data.Table(no_attr_domain, cls.iris)

    # dataset with large blank regions
    iris_blank = Interpolate(np.arange(20))(cls.iris)

    # dataset without any attributes, but XY
    metas_only_domain = Orange.data.Domain(
        [], None, metas=cls.whitelight.domain.metas)
    whitelight_metas = Orange.data.Table(metas_only_domain, cls.whitelight)

    cls.strange_data = [
        None,
        cls.iris1,
        iris_no_attrs,
        iris_blank,
        whitelight_metas,
    ]
def setUpClass(cls):
    """Load the shared tables and build the list of edge-case inputs."""
    super().setUpClass()
    cls.iris = Orange.data.Table("iris")
    cls.whitelight = Orange.data.Table("whitelight.gsf")

    # copy of whitelight whose first value in the first row is unknown
    cls.whitelight_unknown = cls.whitelight.copy()
    with cls.whitelight_unknown.unlocked():
        cls.whitelight_unknown[0][0] = NAN

    # dataset with a single attribute
    single_attr_domain = Orange.data.Domain(cls.iris.domain[:1])
    cls.iris1 = cls.iris.transform(single_attr_domain)

    # dataset without any attributes
    no_attr_domain = Orange.data.Domain([])
    iris_no_attrs = cls.iris.transform(no_attr_domain)

    # dataset without rows
    iris_no_rows = cls.iris[:0]

    # dataset with large blank regions
    iris_blank = Interpolate(np.arange(20))(cls.iris)

    # dataset without any attributes, but XY
    metas_only_domain = Orange.data.Domain(
        [], None, metas=cls.whitelight.domain.metas)
    whitelight_metas = cls.whitelight.transform(metas_only_domain)

    cls.strange_data = [
        None,
        cls.iris1,
        iris_no_attrs,
        iris_no_rows,
        iris_blank,
        whitelight_metas,
    ]
def test_predict_different_domain(self):
    """Predicting on a shifted test domain raises a transformation error."""
    train, test = separate_learn_test(self.collagen)
    shifted_points = getx(test) - 1
    test = Interpolate(points=shifted_points)(test)  # other test domain
    with self.assertRaises(DomainTransformationError):
        model = logreg(train)
        model(test)
def test_predict_different_domain(self):
    """A shifted test domain should give a near-random AUC."""
    train, test = separate_learn_test(self.collagen)
    shifted_points = getx(test) - 1
    test = Interpolate(points=shifted_points)(test)  # other test domain
    learners = [LogisticRegressionLearner()]
    results = TestOnTestData(train, test, learners)
    aucdestroyed = AUC(results)
    self.assertTrue(0.45 < aucdestroyed < 0.55)
def test_nofloatname(self):
    """
    With attribute names that are not numbers, interpolating at 0.5
    gives the mean of the first two columns.
    """
    data = Orange.data.Table("iris")
    halfway = Interpolate([0.5])
    actual = halfway(data).X.ravel()
    expected = data.X[:, :2].mean(axis=1)
    np.testing.assert_allclose(actual, expected)
def preprocessor_data(preproc):
    """
    Return appropriate test file for a preprocessor.

    Very slow preprocessors should get smaller files: ME_EMSC instances
    get SMALLER_COLLAGEN, everything else gets SMALL_COLLAGEN.
    """
    if isinstance(preproc, ME_EMSC):
        return SMALLER_COLLAGEN
    return SMALL_COLLAGEN


# Preprocessors that work per sample and should return the same
# result for a sample independent of the other samples
# NOTE(review): the first Integrate limit, [900, 100], may be a typo for
# [900, 1000] - confirm before changing.
PREPROCESSORS_INDEPENDENT_SAMPLES = [
    Interpolate(np.linspace(1000, 1700, 100)),
    SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2),
    Cut(lowlim=1000, highlim=1800),
    GaussianSmoothing(sd=3.),
    Absorbance(),
    Transmittance(),
    Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]),
    Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakMax, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakBaseline, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakAt, limits=[[1100]]),
    Integrate(methods=Integrate.PeakX, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakXBaseline, limits=[[1100, 1200]]),
    RubberbandBaseline(),
    LinearBaseline(),
    # NOTE(review): the list is cut off here in the visible chunk; the
    # remaining entries and the closing bracket are not shown.
def test_same(self):
    """Interpolating at the original positions returns the original values."""
    data = Orange.data.Table("iris")
    n_attrs = len(data.domain.attributes)
    same_points = range(n_attrs)
    result = Interpolate(same_points)(data)
    np.testing.assert_allclose(result.X, data.X)