Esempio n. 1
0
 def test_slightly_different_domain(self):
     """Classification score should survive small changes of the test domain.

     For each preprocessor in PREPROCESSORS the AUC obtained on the
     untouched test data is compared with the AUC on test data whose
     attributes were altered and, afterwards, shifted by interpolation.
     """
     learner = LogisticRegressionLearner(preprocessors=[])
     for proc in PREPROCESSORS:
         # logistic regression that can not handle unknown values
         raw_train, raw_test = separate_learn_test(self.collagen)
         baseline_train = proc(raw_train)
         auc_baseline = AUC(
             TestOnTestData(baseline_train, raw_test, [learner]))
         altered_test = odd_attr(destroy_atts_conversion(raw_test))
         # train only on an inner subset of points so that every test point
         # lies within the train points, which gives no unknowns
         inner_points = getx(raw_train)[1:-3]
         interp_train = proc(Interpolate(points=inner_points)(raw_train))
         # convert the domain explicitly: exceptions raised here would
         # otherwise be silently handled in TestOnTestData
         Orange.data.Table(interp_train.domain, altered_test)
         auc_altered = AUC(
             TestOnTestData(interp_train, altered_test, [learner]))
         self.assertAlmostEqual(auc_altered, auc_baseline, delta=0.02)
         # additionally shift the test points
         shifted_points = getx(altered_test) - 1.
         shifted_test = Interpolate(points=shifted_points)(altered_test)
         Orange.data.Table(interp_train.domain, shifted_test)
         auc_shifted = AUC(
             TestOnTestData(interp_train, shifted_test, [learner]))
         # only a slight difference is tolerated
         self.assertAlmostEqual(auc_shifted, auc_baseline, delta=0.05)
 def commit(self):
     """Interpolate the input data in the selected mode and send the result.

     Mode 0 interpolates at the data's own x values, mode 1 at an evenly
     spaced grid built from ``xmin``, ``xmax`` and ``dx``, and mode 2 applies
     ``data_points_interpolate`` when it is set.  ``None`` is sent when
     ``self.data`` is falsy, when ``dx`` is not positive or when the grid
     would have 10002 points or more.
     """
     self.Error.dxzero.clear()
     self.Error.too_many_points.clear()
     result = None
     if self.data:
         mode = self.input_radio
         if mode == 0:
             result = Interpolate(getx(self.data))(self.data)
         elif mode == 1:
             xs = getx(self.data)
             if self.dx > 0:
                 # limits that are not set fall back to the data extent
                 lower = np.min(xs) if self.xmin is None else self.xmin
                 upper = np.max(xs) if self.xmax is None else self.xmax
                 lower, upper = min(lower, upper), max(lower, upper)
                 n_points = abs(math.ceil((upper - lower) / self.dx))
                 if n_points >= 10002:
                     self.Error.too_many_points(n_points)
                 else:
                     grid = np.arange(lower, upper, self.dx)
                     result = Interpolate(grid)(self.data)
             else:
                 # reached for zero, negative and NaN steps alike
                 self.Error.dxzero()
         elif mode == 2 and self.data_points_interpolate is not None:
             result = self.data_points_interpolate(self.data)
     self.Outputs.interpolated_data.send(result)
 def test_unknown_elsewhere_different(self):
     """Compare the NaN-aware interpolation functions with the plain one.

     Both ``interp1d_with_unknowns_*`` functions must fill the three
     unknowns with the expected values, while
     ``interp1d_wo_unknowns_scipy`` must leave NaNs in the output.
     """
     data = Orange.data.Table("iris")
     with data.unlocked():
         for row, col in ((0, 1), (1, 1), (1, 2)):
             data.X[row, col] = np.nan
     im = Interpolate(getx(data))
     expected = (
         (0, 1, 3.25),
         (1, 1, 3.333333333333334),
         (1, 2, 1.766666666666667),
     )
     for fill_fn in (interp1d_with_unknowns_numpy,
                     interp1d_with_unknowns_scipy):
         im.interpfn = fill_fn
         interpolated = im(data)
         for row, col, value in expected:
             self.assertAlmostEqual(interpolated.X[row, col], value)
         self.assertFalse(np.any(np.isnan(interpolated.X)))
     # keep the result of the last NaN-aware run for the final comparison
     save_X = interpolated.X
     im.interpfn = interp1d_wo_unknowns_scipy
     interpolated = im(data)
     self.assertTrue(np.any(np.isnan(interpolated.X)))
     # rows that never contained unknowns have to stay unchanged
     np.testing.assert_almost_equal(data.X[2:], save_X[2:])
Esempio n. 4
0
    def test_slightly_different_domain(self):
        """Classification score should survive small changes of the test domain.

        For each preprocessor the AUC obtained on the untouched test data is
        compared with the AUC on test data with slightly changed wavenumbers
        and, afterwards, with test data shifted by interpolation.
        Preprocessors that have a ``skip_add_zeros`` attribute are skipped.
        """
        # Rows full of unknowns make LogisticRegression undefined. They can
        # appear, for example, with EMSC, if one of the badspectra is a
        # spectrum from the data, so such rows are removed by the learner.
        learner = LogisticRegressionLearner(preprocessors=[_RemoveNaNRows()])

        for proc in PREPROCESSORS:
            if hasattr(proc, "skip_add_zeros"):
                continue
            # logistic regression that can not handle unknown values
            raw_train, raw_test = separate_learn_test(self.collagen)
            baseline_train = proc(raw_train)
            auc_baseline = AUC(
                TestOnTestData()(baseline_train, raw_test, [learner]))
            altered_test = odd_attr(
                slightly_change_wavenumbers(raw_test, 0.00001))
            # train only on an inner subset of points so that every test
            # point lies within the train points, which gives no unknowns
            inner_points = getx(raw_train)[1:-3]
            interp_train = proc(Interpolate(points=inner_points)(raw_train))
            # convert the domain explicitly: exceptions raised here would
            # otherwise be silently handled in TestOnTestData
            Orange.data.Table(interp_train.domain, altered_test)
            auc_altered = AUC(
                TestOnTestData()(interp_train, altered_test, [learner]))
            self.assertAlmostEqual(
                auc_altered, auc_baseline, delta=0.02,
                msg="Preprocessor " + str(proc))
            # additionally shift the test points
            shifted_points = getx(altered_test) - 1.
            shifted_test = Interpolate(points=shifted_points)(altered_test)
            Orange.data.Table(interp_train.domain, shifted_test)
            auc_shifted = AUC(
                TestOnTestData()(interp_train, shifted_test, [learner]))
            # only a slight difference is tolerated
            self.assertAlmostEqual(
                auc_shifted, auc_baseline, delta=0.05,
                msg="Preprocessor " + str(proc))
Esempio n. 5
0
 def test_predict_different_domain_interpolation(self):
     """AUC should stay close after a shift and drop as the test data are cut."""

     def auc_of(train_data, test_data):
         # every evaluation uses a freshly constructed learner
         results = TestOnTestData()(
             train_data, test_data, [LogisticRegressionLearner()])
         return AUC(results)

     train, test = separate_learn_test(self.collagen)
     auc_baseline = auc_of(train, test)
     # move the test data to another domain
     shifted_test = Interpolate(points=getx(test) - 1.)(test)
     # make train capable of interpolation
     interp_train = Interpolate(points=getx(train))(train)
     auc_shifted = auc_of(interp_train, shifted_test)
     # the shift may decrease AUC slightly
     self.assertAlmostEqual(auc_baseline, auc_shifted, delta=0.01)
     cut_wide = Cut(1000, 1700)(shifted_test)
     auc_wide = auc_of(interp_train, cut_wide)
     cut_medium = Cut(1100, 1600)(cut_wide)
     auc_medium = auc_of(interp_train, cut_medium)
     cut_narrow = Cut(1200, 1500)(cut_medium)
     auc_narrow = auc_of(interp_train, cut_narrow)
     # the more is cut away, the lower the expected score
     self.assertTrue(auc_baseline > auc_wide > auc_medium > auc_narrow)
 def test_unknown_middle(self):
     """A column consisting only of unknowns is filled in by interpolation."""
     data = Orange.data.Table("iris")
     with data.unlocked():
         # blank out one whole column that is not at the edge
         data.X[:, 1] = np.nan
     interpolator = Interpolate(getx(data))
     interpolated = interpolator(data)
     nan_mask = np.isnan(interpolated.X)
     self.assertFalse(np.any(nan_mask))
 def test_out_of_band(self):
     """Points outside the original range are unknown, inner ones unchanged."""
     data = Orange.data.Table("iris")
     n_attrs = len(data.domain.attributes)
     # one extra point on each side of the attribute indices
     points = range(-1, n_attrs + 1)
     interpolated = Interpolate(points)(data)
     inner = interpolated.X[:, 1:5]
     edges = interpolated.X[:, [0, -1]]
     np.testing.assert_allclose(inner, data.X)
     np.testing.assert_equal(edges, np.nan)
 def setUpClass(cls):
     """Load the shared tables and build the list of unusual data sets.

     Sets ``iris``, ``collagen``, ``normal_data``, ``unknown_last_instance``,
     ``same_features``, ``only_inf`` and ``strange_data`` on the class.
     """
     super().setUpClass()
     cls.iris = Table("iris")
     cls.collagen = Table("collagen")
     cls.normal_data = [cls.iris, cls.collagen]
     # dataset with a single attribute
     iris1 = Table(Domain(cls.iris.domain[:1]), cls.iris)
     # dataset without any attributes
     iris0 = Table(Domain([]), cls.iris)
     # data set with no lines
     empty = cls.iris[:0]
     # dataset with large blank regions
     irisunknown = Interpolate(np.arange(20))(cls.iris)
     cls.unknown_last_instance = cls.iris.copy()
     cls.unknown_last_instance.X[
         73] = NAN  # needs to be unknown after sampling and permutation
     # a data set with features with the same names
     sfdomain = Domain([ContinuousVariable("1"), ContinuousVariable("1")])
     cls.same_features = Table(sfdomain, [[0, 1]])
     # a data set with only infs
     cls.only_inf = iris1.copy()
     # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
     cls.only_inf.X *= np.inf
     cls.strange_data = [
         iris1, iris0, empty, irisunknown, cls.unknown_last_instance,
         cls.same_features, cls.only_inf
     ]
 def test_permute(self):
     """Interpolation must follow attribute names, not column order.

     The attributes of iris are renamed to a permutation of their indices
     and the columns are permuted accordingly; interpolating at the
     original (and then at another permuted) set of points has to return
     the matching original columns.
     """
     rs = np.random.RandomState(0)
     data = Orange.data.Table("iris")
     oldX = data.X
     p = rs.permutation(range(len(data.domain.attributes)))
     try:
         # permute data; the variables are renamed in place
         for i, a in enumerate(data.domain.attributes):
             a.name = str(p[i])
         data.X = data.X[:, p]
         interpolated = Interpolate(range(len(data.domain.attributes)))(data)
         np.testing.assert_allclose(interpolated.X, oldX)
         # also permute output
         p1 = rs.permutation(range(len(data.domain.attributes)))
         interpolated = Interpolate(p1)(data)
         np.testing.assert_allclose(interpolated.X, oldX[:, p1])
     finally:
         # clear the caches even when an assertion fails, so that the
         # renamed variables do not leak into other tests
         Orange.data.domain.Variable._clear_all_caches()
 def setUpClass(cls):
     """Load the shared tables and build the list of unusual data sets.

     Sets ``iris``, ``titanic``, ``collagen``, ``normal_data``,
     ``unknown_last_instance``, ``unknown_pts``, ``only_inf`` and
     ``strange_data`` on the class.
     """
     super().setUpClass()
     cls.iris = Table("iris")
     cls.titanic = Table("titanic")
     cls.collagen = Table("collagen")
     cls.normal_data = [cls.iris, cls.collagen]
     # dataset with a single attribute
     iris1 = cls.iris.transform(Domain(cls.iris.domain[:1]))
     # dataset without any attributes
     iris0 = cls.iris.transform(Domain([]))
     # data set with no lines
     empty = cls.iris[:0]
     # dataset with large blank regions
     irisunknown = Interpolate(np.arange(20))(cls.iris)
     cls.unknown_last_instance = cls.iris.copy()
     with cls.unknown_last_instance.unlocked():
         cls.unknown_last_instance.X[
             73] = NAN  # needs to be unknown after sampling and permutation
     # dataset with mixed unknowns
     cls.unknown_pts = cls.collagen.copy()
     with cls.unknown_pts.unlocked():
         cls.unknown_pts[5] = np.nan
         cls.unknown_pts[8:10] = np.nan
         cls.unknown_pts[15] = np.inf
     # a data set with only infs
     cls.only_inf = iris1.copy()
     with cls.only_inf.unlocked():
         # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
         cls.only_inf.X *= np.inf
     cls.strange_data = [
         iris1, iris0, empty, irisunknown, cls.unknown_last_instance,
         cls.only_inf, cls.unknown_pts
     ]
Esempio n. 11
0
 def test_predict_savgol_another_interpolate(self):
     """Interpolating after Savitzky-Golay filtering keeps the AUC close."""
     train, test = separate_learn_test(self.collagen)
     filtered = SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2)(train)
     auc_filtered = AUC(
         TestOnTestData()(filtered, test, [LogisticRegressionLearner()]))
     # interpolate the filtered data at its own points
     reinterpolated = Interpolate(points=getx(filtered))(filtered)
     auc_reinterpolated = AUC(
         TestOnTestData()(reinterpolated, test, [LogisticRegressionLearner()]))
     self.assertAlmostEqual(auc_filtered, auc_reinterpolated, delta=0.02)
Esempio n. 12
0
 def test_predict_samename_domain_interpolation(self):
     """AUC is exactly preserved when train is interpolated at its own points."""
     train, test = separate_learn_test(self.collagen)
     auc_baseline = AUC(
         TestOnTestData()(train, test, [LogisticRegressionLearner()]))
     broken_test = destroy_atts_conversion(test)
     # make train capable of interpolation
     interp_train = Interpolate(points=getx(train))(train)
     auc_interp = AUC(
         TestOnTestData()(interp_train, broken_test,
                          [LogisticRegressionLearner()]))
     self.assertEqual(auc_baseline, auc_interp)
 def test_permute(self):
     """Interpolation must follow attribute names, not column order.

     A new table is built whose attributes are named after a permutation
     of the column indices and whose columns are permuted accordingly.
     """
     rs = np.random.RandomState(0)
     data = Orange.data.Table("iris")
     original_X = data.X
     n_attrs = len(data.domain.attributes)
     # permute the data: one new variable per permuted index
     order = rs.permutation(range(n_attrs))
     renamed = [Orange.data.ContinuousVariable(str(idx)) for idx in order]
     permuted_domain = Orange.data.Domain(renamed)
     data = Orange.data.Table.from_numpy(permuted_domain,
                                         X=data.X[:, order])
     in_order = range(len(data.domain.attributes))
     interpolated = Interpolate(in_order)(data)
     np.testing.assert_allclose(interpolated.X, original_X)
     # permute the requested output points as well
     out_order = rs.permutation(range(len(data.domain.attributes)))
     interpolated = Interpolate(out_order)(data)
     np.testing.assert_allclose(interpolated.X, original_X[:, out_order])
 def test_domain_conversion(self):
     """The domain of interpolated data can convert the original table."""
     data = Orange.data.Table("iris")
     interpolated = Interpolate([0.5, 1.5])(data)
     converted = Orange.data.Table.from_table(interpolated.domain, data)
     self.assertEqual(interpolated.domain, converted.domain)
     # both the attribute values and the class values must agree
     for direct, via_domain in ((interpolated.X, converted.X),
                                (interpolated.Y, converted.Y)):
         np.testing.assert_equal(direct, via_domain)
Esempio n. 15
0
def test_time():
    """Print interpolation timings with and without NaN handling.

    For each data set one value is set to NaN, then three runs are timed:
    interpolation without NaN handling, and NaN handling with the numpy
    and with the scipy implementation.  The last result must not contain
    any NaN.
    """
    fns = ["collagen", dust(), spectra20nea(), "peach_juice.dpt"]
    for fn in fns:
        print(fn)
        data = Table(fn)
        print(data.X.shape)
        data[0, 2] = np.nan
        t = time.time()
        Interpolate(getx(data), handle_nans=False)(data)
        print("no nan", time.time() - t)
        t = time.time()
        intp = Interpolate(getx(data), handle_nans=True)
        intp.interpfn = interp1d_with_unknowns_numpy
        interpolated = intp(data)
        print("nan handling with numpy", time.time() - t)
        # restart the clock so that the scipy figure excludes the numpy run
        t = time.time()
        intp.interpfn = interp1d_with_unknowns_scipy
        interpolated = intp(data)
        print("nan handling with scipy", time.time() - t)
        assert not np.any(np.isnan(interpolated.X))
Esempio n. 16
0
 def test_predict_different_domain(self):
     """Predicting on a shifted, non-interpolatable domain must not work.

     When ``DomainTransformationError`` can be imported, the prediction is
     expected to raise it; otherwise the AUC is expected to be near 0.5.
     """
     train, test = separate_learn_test(self.collagen)
     test = Interpolate(points=getx(test) - 1)(test) # other test domain
     # only the import is guarded, so an ImportError raised while fitting
     # or predicting is not mistaken for a missing exception class
     try:
         from Orange.data.table import DomainTransformationError
     except ImportError:  # until Orange 3.19
         aucdestroyed = AUC(TestOnTestData(train, test, [LogisticRegressionLearner()]))
         self.assertTrue(0.45 < aucdestroyed < 0.55)
     else:
         with self.assertRaises(DomainTransformationError):
             LogisticRegressionLearner()(train)(test)
 def test_floatname(self):
     """Interpolating halfway between two attributes gives their mean."""
     data = Orange.data.Table("collagen.csv")
     f1, f2 = 20, 21
     attrs = data.domain.attributes
     # attribute names are parsed as the positions of the two columns
     left = float(attrs[f1].name)
     right = float(attrs[f2].name)
     midpoint = (left + right)/2
     interpolated = Interpolate([midpoint])(data)
     from_interpolation = interpolated.X.ravel()
     from_mean = data.X[:, [f1, f2]].mean(axis=1)
     np.testing.assert_allclose(from_interpolation, from_mean)
 def test_unknown_elsewhere(self):
     """Isolated unknowns are replaced with the expected interpolated values."""
     data = Orange.data.Table("iris")
     for row, col in ((0, 1), (1, 1), (1, 2)):
         data.X[row, col] = np.nan
     im = Interpolate(getx(data))
     interpolated = im(data)
     expected = (
         (0, 1, 3.25),
         (1, 1, 3.333333333333334),
         (1, 2, 1.766666666666667),
     )
     for row, col, value in expected:
         self.assertAlmostEqual(interpolated.X[row, col], value)
     # no unknown may remain anywhere in the output
     self.assertFalse(np.any(np.isnan(interpolated.X)))
 def test_unknown_elsewhere_different(self):
     """Compare the NaN-aware interpolation functions with the plain one.

     Both ``interp1d_with_unknowns_*`` functions must fill the three
     unknowns with the expected values, while
     ``interp1d_wo_unknowns_scipy`` must leave NaNs in the output.
     """
     data = Orange.data.Table("iris")
     for row, col in ((0, 1), (1, 1), (1, 2)):
         data.X[row, col] = np.nan
     im = Interpolate(getx(data))
     expected = (
         (0, 1, 3.25),
         (1, 1, 3.333333333333334),
         (1, 2, 1.766666666666667),
     )
     for fill_fn in (interp1d_with_unknowns_numpy,
                     interp1d_with_unknowns_scipy):
         im.interpfn = fill_fn
         interpolated = im(data)
         for row, col, value in expected:
             self.assertAlmostEqual(interpolated.X[row, col], value)
         self.assertFalse(np.any(np.isnan(interpolated.X)))
     # keep the result of the last NaN-aware run for the final comparison
     save_X = interpolated.X
     im.interpfn = interp1d_wo_unknowns_scipy
     interpolated = im(data)
     self.assertTrue(np.any(np.isnan(interpolated.X)))
     # rows that never contained unknowns have to stay unchanged
     np.testing.assert_almost_equal(data.X[2:], save_X[2:])
Esempio n. 20
0
 def setUpClass(cls):
     """Load iris and whitelight and prepare a list of unusual inputs.

     Sets ``iris``, ``whitelight``, ``whitelight_unknown``, ``iris1`` and
     ``strange_data`` on the class.
     """
     super().setUpClass()
     cls.iris = Orange.data.Table("iris")
     cls.whitelight = Orange.data.Table("whitelight.gsf")
     # a copy of whitelight with one value made unknown
     cls.whitelight_unknown = cls.whitelight.copy()
     cls.whitelight_unknown[0]["value"] = NAN
     # dataset with a single attribute
     single_attr_domain = Orange.data.Domain(cls.iris.domain[:1])
     cls.iris1 = Orange.data.Table(single_attr_domain, cls.iris)
     # dataset without any attributes
     no_attrs = Orange.data.Table(Orange.data.Domain([]), cls.iris)
     # dataset with large blank regions
     blank_regions = Interpolate(np.arange(20))(cls.iris)
     # dataset without any attributes, but XY
     metas_only_domain = Orange.data.Domain(
         [], None, metas=cls.whitelight.domain.metas)
     metas_only = Orange.data.Table(metas_only_domain, cls.whitelight)
     cls.strange_data = [
         None, cls.iris1, no_attrs, blank_regions, metas_only]
Esempio n. 21
0
 def setUpClass(cls):
     """Load iris and whitelight and prepare a list of unusual inputs.

     Sets ``iris``, ``whitelight``, ``whitelight_unknown``, ``iris1`` and
     ``strange_data`` on the class.
     """
     super().setUpClass()
     cls.iris = Orange.data.Table("iris")
     cls.whitelight = Orange.data.Table("whitelight.gsf")
     # a copy of whitelight with one value made unknown
     cls.whitelight_unknown = cls.whitelight.copy()
     with cls.whitelight_unknown.unlocked():
         cls.whitelight_unknown[0][0] = NAN
     # dataset with a single attribute
     single_attr_domain = Orange.data.Domain(cls.iris.domain[:1])
     cls.iris1 = cls.iris.transform(single_attr_domain)
     # dataset without any attributes
     no_attrs = cls.iris.transform(Orange.data.Domain([]))
     # dataset without rows
     no_rows = cls.iris[:0]
     # dataset with large blank regions
     blank_regions = Interpolate(np.arange(20))(cls.iris)
     # dataset without any attributes, but XY
     metas_only_domain = Orange.data.Domain(
         [], None, metas=cls.whitelight.domain.metas)
     metas_only = cls.whitelight.transform(metas_only_domain)
     cls.strange_data = [
         None, cls.iris1, no_attrs, no_rows, blank_regions, metas_only]
Esempio n. 22
0
 def test_predict_different_domain(self):
     """Predicting on a shifted test domain raises DomainTransformationError."""
     train, test = separate_learn_test(self.collagen)
     # move the test data to another domain
     shifted_points = getx(test) - 1
     shifted_test = Interpolate(points=shifted_points)(test)
     with self.assertRaises(DomainTransformationError):
         model = logreg(train)
         model(shifted_test)
Esempio n. 23
0
 def test_predict_different_domain(self):
     """On a shifted test domain the AUC is expected to be close to 0.5."""
     train, test = separate_learn_test(self.collagen)
     # move the test data to another domain
     shifted_points = getx(test) - 1
     shifted_test = Interpolate(points=shifted_points)(test)
     results = TestOnTestData(
         train, shifted_test, [LogisticRegressionLearner()])
     auc_destroyed = AUC(results)
     self.assertTrue(0.45 < auc_destroyed and auc_destroyed < 0.55)
 def test_nofloatname(self):
     """Interpolating iris at 0.5 gives the mean of its first two columns."""
     data = Orange.data.Table("iris")
     halfway = Interpolate([0.5])
     from_interpolation = halfway(data).X.ravel()
     from_mean = data.X[:, :2].mean(axis=1)
     np.testing.assert_allclose(from_interpolation, from_mean)
def preprocessor_data(preproc):
    """
    Return the appropriate test data for a preprocessor.

    ME_EMSC instances get SMALLER_COLLAGEN; every other preprocessor gets
    SMALL_COLLAGEN, because very slow preprocessors should get smaller files.
    """
    is_slow = isinstance(preproc, ME_EMSC)
    return SMALLER_COLLAGEN if is_slow else SMALL_COLLAGEN


# Preprocessors that work per sample and should return the same
# result for a sample independent of the other samples
PREPROCESSORS_INDEPENDENT_SAMPLES = [
    Interpolate(np.linspace(1000, 1700, 100)),
    SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2),
    Cut(lowlim=1000, highlim=1800),
    GaussianSmoothing(sd=3.),
    Absorbance(),
    Transmittance(),
    Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]),
    Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakMax, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakBaseline, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakAt, limits=[[1100]]),
    Integrate(methods=Integrate.PeakX, limits=[[1100, 1200]]),
    Integrate(methods=Integrate.PeakXBaseline, limits=[[1100, 1200]]),
    RubberbandBaseline(),
    LinearBaseline(),
 def test_same(self):
     """Interpolating at the original points returns the original values."""
     data = Orange.data.Table("iris")
     n_attrs = len(data.domain.attributes)
     same_points = Interpolate(range(n_attrs))
     interpolated = same_points(data)
     np.testing.assert_allclose(interpolated.X, data.X)