def test_select_1(self):
    # With k=1 the single attribute kept by the selector must be the one
    # that the Gini scorer rates highest on the titanic fixture.
    scorer = Gini()
    selector = SelectBestFeatures(method=scorer, k=1)
    selected = selector(self.titanic)

    # Pair every attribute with its score; the largest pair carries the
    # winning attribute in its second slot.
    scored = [(scorer(self.titanic, attr), attr)
              for attr in self.titanic.domain.attributes]
    _, top_attr = max(scored)

    self.assertEqual(selected.domain.attributes[0], top_attr)
def test_continuous_scores_on_discrete_features(self):
    # The `auro_mpg` fixture is passed through Impute before scoring.
    imputed = Impute(self.auro_mpg)

    # Invoking the regression scorer directly on the table must raise.
    self.assertRaises(ValueError, UnivariateLinearRegression, imputed)

    # Going through SelectBestFeatures must succeed instead, and the
    # resulting domain has the same length as the input domain.
    reduced = SelectBestFeatures(method=UnivariateLinearRegression)(imputed)
    self.assertEqual(len(reduced.domain), len(imputed.domain))
def test_select_1(self):
    # Selecting one feature by Gini from the 'titanic' table must return
    # the attribute with the highest Gini score.
    titanic = Table('titanic')
    scorer = Gini()
    selected = SelectBestFeatures(method=scorer, k=1)(titanic)

    # max() over (score, attribute) pairs; index 1 is the attribute.
    score_pairs = [(scorer(titanic, attr), attr)
                   for attr in titanic.domain.attributes]
    expected_attr = max(score_pairs)[1]

    self.assertEqual(selected.domain.attributes[0], expected_attr)
def test_select_threshold(self):
    # Every attribute that survives a threshold-based selection on the
    # 'wine' table must have an ANOVA score of at least that threshold.
    data = Table('wine')
    anova = ANOVA()
    threshold = 30
    data2 = SelectBestFeatures(method=anova, threshold=threshold)(data)

    # Scorers are invoked as scorer(data, feature) -- data first -- which is
    # the argument order used by the other scorer calls in this file.
    self.assertTrue(
        all(anova(data, attr) >= threshold
            for attr in data2.domain.attributes))
def test_defaults(self):
    # SelectBestFeatures is built without an explicit scoring method; the
    # test only checks the kind of attributes that come out for each fixture.
    select_top3 = SelectBestFeatures(k=3)

    # Imputed `auro_mpg` fixture: only continuous attributes are kept.
    kept = select_top3(Impute(self.auro_mpg)).domain.attributes
    self.assertTrue(all(var.is_continuous for var in kept))

    # Wine fixture: only continuous attributes are kept.
    kept = select_top3(self.wine).domain.attributes
    self.assertTrue(all(var.is_continuous for var in kept))

    # Titanic fixture: only discrete attributes are kept.
    kept = select_top3(self.titanic).domain.attributes
    self.assertTrue(all(var.is_discrete for var in kept))
def test_defaults(self):
    # SelectBestFeatures is built without an explicit scoring method; each
    # result is checked for the variable class of its attributes.
    select_top3 = SelectBestFeatures(k=3)

    def every_attribute_is(table, var_type):
        # True when all attributes of `table` are instances of `var_type`.
        return all(isinstance(var, var_type)
                   for var in table.domain.attributes)

    self.assertTrue(every_attribute_is(
        select_top3(Impute(Table('auto-mpg'))), ContinuousVariable))
    self.assertTrue(every_attribute_is(
        select_top3(Table('wine')), ContinuousVariable))
    self.assertTrue(every_attribute_is(
        select_top3(Table('titanic')), DiscreteVariable))
def test_defaults(self):
    # SelectBestFeatures is built without an explicit scoring method; the
    # test checks the kind of attributes returned for three bundled tables.
    select_top3 = SelectBestFeatures(k=3)

    # Imputed 'auto-mpg': all kept attributes are continuous.
    auto_mpg_attrs = select_top3(Impute(Table('auto-mpg'))).domain.attributes
    self.assertTrue(all(var.is_continuous for var in auto_mpg_attrs))

    # 'wine': all kept attributes are continuous.
    wine_attrs = select_top3(Table('wine')).domain.attributes
    self.assertTrue(all(var.is_continuous for var in wine_attrs))

    # 'titanic': all kept attributes are discrete.
    titanic_attrs = select_top3(Table('titanic')).domain.attributes
    self.assertTrue(all(var.is_discrete for var in titanic_attrs))
def test_select_threshold(self):
    # Attributes kept by a threshold-based selection on the heart_disease
    # fixture must each have an ANOVA score of at least the threshold.
    scorer = ANOVA()
    min_score = 30
    selector = SelectBestFeatures(method=scorer, threshold=min_score)
    selected = selector(self.heart_disease)

    kept_attrs = selected.domain.attributes
    self.assertTrue(
        all(scorer(self.heart_disease, attr) >= min_score
            for attr in kept_attrs))
def test_select_2(self):
    # Exercises SelectBestFeatures with float and zero values of k.
    scorer = Gini()

    # k=1.0: the first attribute of the result must be the attribute
    # that Gini scores highest on the titanic fixture.
    reduced = SelectBestFeatures(method=scorer, k=1.0)(self.titanic)
    score_pairs = [(scorer(self.titanic, attr), attr)
                   for attr in self.titanic.domain.attributes]
    self.assertEqual(reduced.domain.attributes[0], max(score_pairs)[1])

    # k=0 (no k and no threshold): every attribute is retained.
    reduced = SelectBestFeatures(method=scorer, k=0)(self.titanic)
    self.assertEqual(len(reduced.domain.attributes),
                     len(self.titanic.domain.attributes))

    # Fractional k of 31%, 35% and 1% (of titanic's 3 attributes, per the
    # original notes) must each leave exactly one attribute.
    for fraction in (0.31, 0.35, 0.01):
        reduced = SelectBestFeatures(method=scorer, k=fraction)(self.titanic)
        self.assertEqual(len(reduced.domain.attributes), 1)

    # The number of selected attributes is relative to the attributes of
    # the table being processed: k=1.0 on heart_disease yields 13.
    reduced = SelectBestFeatures(method=scorer, k=1.0)(self.heart_disease)
    self.assertEqual(len(reduced.domain.attributes), 13)
def test_discrete_scores_on_continuous_features(self):
    # Gini and Chi2 are applied to the iris fixture, both through
    # SelectBestFeatures and by calling the scorer class directly.
    cols = self.iris.columns
    for scorer_cls in (Gini, Chi2):
        ranked = SelectBestFeatures(method=scorer_cls)(self.iris)

        # The selection must return the attributes in exactly this order.
        wanted_order = (
            cols.petal_length,
            cols.petal_width,
            cols.sepal_length,
            cols.sepal_width,
        )
        self.assertSequenceEqual(ranked.domain.attributes, wanted_order)

        # Scoring the whole table yields one score per attribute.
        all_scores = scorer_cls(ranked)
        self.assertEqual(len(all_scores), 4)

        # Scoring a single attribute yields a plain float.
        single_score = scorer_cls(ranked, cols.petal_length)
        self.assertIsInstance(single_score, float)
def test_mixed_features(self):
    # Selecting k=2 features by UnivariateLinearRegression on 'auto-mpg'
    # must leave two continuous attributes and all discrete attributes.
    table = Table('auto-mpg')
    table.X = Imputer().fit_transform(table.X)
    selector = SelectBestFeatures(method=UnivariateLinearRegression(), k=2)
    reduced = selector(table)

    def count_of(variables, var_type):
        # Number of entries in `variables` that are instances of `var_type`.
        return sum(isinstance(var, var_type) for var in variables)

    self.assertEqual(
        count_of(reduced.domain.attributes, ContinuousVariable), 2)
    self.assertEqual(
        count_of(reduced.domain.attributes, DiscreteVariable),
        count_of(table.domain.attributes, DiscreteVariable))
def test_discrete_scores_on_continuous_features(self):
    # Gini and Chi2 scorer instances are applied to the iris fixture, both
    # through SelectBestFeatures and by calling the scorer directly.
    cols = self.iris.columns
    for scorer in (Gini(), Chi2()):
        ranked = SelectBestFeatures(method=scorer)(self.iris)

        # The selection must return the attributes in exactly this order.
        wanted_order = (
            cols.petal_length,
            cols.petal_width,
            cols.sepal_length,
            cols.sepal_width,
        )
        self.assertSequenceEqual(ranked.domain.attributes, wanted_order)

        # Scoring the whole table yields one score per attribute.
        all_scores = scorer(ranked)
        self.assertEqual(len(all_scores), 4)

        # Scoring one attribute yields a zero-dimensional float value.
        single_score = scorer(ranked, cols.petal_length)
        self.assertEqual(single_score.ndim, 0)  # a scalar
        self.assertTrue(np.issubdtype(single_score.dtype, float))
def test_error_when_using_regression_score_on_classification_data(self):
    # Applying a selector built on UnivariateLinearRegression to the wine
    # fixture must raise ValueError.
    selector = SelectBestFeatures(method=UnivariateLinearRegression(), k=3)
    self.assertRaises(ValueError, selector, self.wine)
def test_error(self):
    # Applying a selector built on UnivariateLinearRegression to the
    # 'wine' table must raise ValueError.
    wine = Table('wine')
    selector = SelectBestFeatures(method=UnivariateLinearRegression(), k=3)
    self.assertRaises(ValueError, selector, wine)