def test_method(self):
    dom = discretize.DomainDiscretizer(self.table_class)
    self.assertEqual(len(dom[1].values), 4)
    dom = discretize.DomainDiscretizer(self.table_class,
                                       method=discretize.EqualWidth(n=2))
    self.assertEqual(len(dom[1].values), 2)
def test_equalwidth_100_to_4(self):
    X = np.arange(101).reshape((101, 1))
    table = data.Table(X)
    disc = discretize.EqualWidth(n=4)
    dvar = disc(table, table.domain[0])
    self.assertEqual(len(dvar.values), 4)
    self.assertEqual(dvar.compute_value.points, [25, 50, 75])
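For reference, the same call pattern works outside a test case; this is a minimal sketch assuming a recent Orange3 (the variable names are illustrative):

import numpy as np
from Orange import data
from Orange.preprocess import discretize

# Discretize a single column holding 0..100 into four equal-width bins
table = data.Table.from_numpy(None, np.arange(101).reshape((101, 1)))
dvar = discretize.EqualWidth(n=4)(table, table.domain[0])
print(dvar.compute_value.points)  # cut points, per the test above: [25, 50, 75]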
def test_fixed(self):
    dom = discretize.DomainDiscretizer(self.table_no_class,
                                       method=discretize.EqualWidth(n=2),
                                       fixed={"Feature 2": [1, 11]})
    self.assertEqual(len(dom.attributes), 2)
    self.assertEqual(dom[0].compute_value.points, [0.5])
    self.assertEqual(dom[1].compute_value.points, [6])
def random_data(nrows, ncols):
    np.random.seed(42)
    x = np.random.randint(0, 2, (nrows, ncols))
    col = np.random.randint(ncols)
    y = x[:nrows, col].reshape(nrows, 1)
    table = Table.from_numpy(None, x, y)
    table = preprocess.Discretize(discretize.EqualWidth(n=3))(table)
    return table
def random_data(nrows, ncols):
    np.random.seed(42)
    # np.random.random_integers has been removed from NumPy; randint's upper
    # bound is exclusive, so (0, 2) reproduces values in {0, 1}
    x = np.random.randint(0, 2, (nrows, ncols))
    col = np.random.randint(ncols)
    y = x[:nrows, col].reshape(nrows, 1)
    table = Table(x, y)
    table = preprocess.Discretize(discretize.EqualWidth(n=3))(table)
    return table
def test_equalwidth_on_two_values(self):
    s = [0] * 50 + [1] * 50
    random.shuffle(s)
    X = np.array(s).reshape((100, 1))
    table = data.Table(X)
    disc = discretize.EqualWidth(n=4)
    dvar = disc(table, table.domain[0])
    self.assertEqual(len(dvar.values), 4)
    self.assertEqual(dvar.compute_value.points, [0.25, 0.5, 0.75])
def random_data(nrows, ncols):
    np.random.seed(42)
    # randint's upper bound is exclusive, so (1, 4) gives values in {1, 2, 3}
    x = np.random.randint(1, 4, (nrows, ncols))
    col = np.random.randint(ncols)
    y = x[:nrows, col].reshape(nrows, 1)
    table = Table(x, y)
    table = discretize.DiscretizeTable(
        table, method=discretize.EqualWidth(n=3))
    return table
def test_bayes(self):
    x = np.random.randint(2, size=(100, 5))
    col = np.random.randint(5)
    y = x[:, col].copy().reshape(100, 1)
    t = Orange.data.Table(x, y)
    t = Orange.preprocess.Discretize(method=discretize.EqualWidth(n=3))(t)
    nb = Orange.classification.NaiveBayesLearner()
    res = Orange.evaluation.TestOnTrainingData(t, [nb])
    np.testing.assert_almost_equal(CA(res), [1])
    t.Y[-20:] = 1 - t.Y[-20:]
    res = Orange.evaluation.TestOnTrainingData(t, [nb])
    self.assertGreaterEqual(CA(res)[0], 0.75)
    self.assertLess(CA(res)[0], 1)
def test_bayes(self):
    x = np.random.randint(2, size=(100, 5))
    col = np.random.randint(5)
    y = x[:, col].copy().reshape(100, 1)
    t = Table(x, y)
    t = Discretize(method=discretize.EqualWidth(n=3))(t)
    nb = NaiveBayesLearner()
    res = TestOnTrainingData()(t, [nb])
    np.testing.assert_almost_equal(CA(res), [1])
    t.Y[-20:] = 1 - t.Y[-20:]
    res = TestOnTrainingData()(t, [nb])
    self.assertGreaterEqual(CA(res)[0], 0.75)
    self.assertLess(CA(res)[0], 1)
def predict_wine_quality(table, n):
    # Make the continuous variables discrete
    disc = Discretize()
    disc.method = discretize.EqualWidth(n=n)
    table = disc(table)

    # Define the domain: the first column is the class, the rest are features
    feature_vars = list(table.domain[1:])
    class_label_var = table.domain[0]
    wine_domain = Domain(feature_vars, class_label_var)
    table = Table.from_table(domain=wine_domain, source=table)

    # Construct the learner and print cross-validation results
    nn_learner = NNClassificationLearner(hidden_layer_sizes=(10, ),
                                         max_iter=4000)
    eval_results = CrossValidation(table, [nn_learner], k=10)
    print("Accuracy of cross validation: {:.3f}".format(
        scoring.CA(eval_results)[0]))
    print("AUC: {:.3f}".format(scoring.AUC(eval_results)[0]))
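A possible driver for the function above, assuming the snippet's imports are in scope; the file name is a placeholder, and the first column of the loaded table is assumed to hold the quality label:

from Orange.data import Table

if __name__ == "__main__":
    # "winequality-red.csv" is an illustrative path, not a bundled dataset
    wine_table = Table("winequality-red.csv")
    predict_wine_quality(wine_table, n=4)  # bin continuous features into 4 intervals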
class TestCA(unittest.TestCase):
    def test_init(self):
        res = Results(nmethods=2, nrows=100)
        res.actual[:50] = 0
        res.actual[50:] = 1
        res.predicted = np.vstack((res.actual, res.actual))
        np.testing.assert_almost_equal(CA(res), [1, 1])
        res.predicted[0][0] = 1
        np.testing.assert_almost_equal(CA(res), [0.99, 1])
        res.predicted[1] = 1 - res.predicted[1]
        np.testing.assert_almost_equal(CA(res), [0.99, 0])

    def test_call(self):
        res = Results(nmethods=2, nrows=100)
        res.actual[:50] = 0
        res.actual[50:] = 1
        res.predicted = np.vstack((res.actual, res.actual))
        ca = CA()
        np.testing.assert_almost_equal(ca(res), [1, 1])
        res.predicted[0][0] = 1
        np.testing.assert_almost_equal(ca(res), [0.99, 1])
        res.predicted[1] = 1 - res.predicted[1]
        np.testing.assert_almost_equal(ca(res), [0.99, 0])

    def test_bayes(self):
        x = np.random.randint(2, size=(100, 5))
        col = np.random.randint(5)
        y = x[:, col].copy().reshape(100, 1)
        t = Table(x, y)
        t = Discretize(
            method=discretize.EqualWidth(n=3))(t)
        nb = NaiveBayesLearner()
        res = TestOnTrainingData(t, [nb])
        np.testing.assert_almost_equal(CA(res), [1])
        t.Y[-20:] = 1 - t.Y[-20:]
        res = TestOnTrainingData(t, [nb])
        self.assertGreaterEqual(CA(res)[0], 0.75)
        self.assertLess(CA(res)[0], 1)
def formatTable(tble):
    """
    Bins the data, then one-hot encodes it.

    :param tble: input data table
    :return: data: tble with binned data,
             X: one-hot-encoded representation of the data,
             mapping: description of what our one-hot encoding represents
    """
    # Discretization (binning)
    # https://docs.orange.biolab.si/3/data-mining-library/reference/preprocess.html
    print("Discretizing data")
    disc = Discretize()
    disc.method = discretize.EqualWidth(n=4)
    data = disc(tble)
    # print("Discretized table:\n{}\n\n".format(data))

    print("One hot encoding data")
    X, mapping = OneHot.encode(data, include_class=True)
    return data, X, mapping
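A minimal usage sketch for the function above, assuming Orange3 plus the Orange3-Associate add-on (which provides OneHot); the dataset name is just an example:

from Orange.data import Table
from Orange.preprocess import Discretize, discretize
from orangecontrib.associate.fpgrowth import OneHot

table = Table("zoo")  # "zoo" ships with Orange; any data table works
binned, X, mapping = formatTable(table)
print(X.shape, len(mapping))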
Custom = namedtuple("Custom", ["points"])

METHODS = [
    (Default, ), (Leave, ), (MDL, ), (EqualFreq, ), (EqualWidth, ),
    (Remove, ), (Custom, )
]

_dispatch = {
    Default:
        lambda m, data, var: _dispatch[type(m.method)](m.method, data, var),
    Leave: lambda m, data, var: var,
    MDL: lambda m, data, var: disc.EntropyMDL()(data, var),
    EqualFreq: lambda m, data, var: disc.EqualFreq(m.k)(data, var),
    EqualWidth: lambda m, data, var: disc.EqualWidth(m.k)(data, var),
    Remove: lambda m, data, var: None,
    Custom:
        lambda m, data, var: disc.Discretizer.create_discretized_var(
            var, m.points)
}

# Variable discretization state
DState = namedtuple(
    "DState",
    ["method",    # discretization method
     "points",    # induced cut points
     "disc_var"]  # induced discretized variable
)
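For illustration, a dispatch table like the one above is typically driven by a small helper such as this sketch (the helper name is hypothetical; state stands for a DState instance):

def apply_method(state, data, var):
    # Look up the handler for the method's type and apply it; returns the
    # discretized variable, the original variable (Leave), or None (Remove)
    return _dispatch[type(state.method)](state.method, data, var)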
    ()),
    MethodDesc(Methods.MDL,
               "Entropy vs. MDL", "entropy",
               "Split values until MDL exceeds the entropy (Fayyad-Irani)\n"
               "(requires discrete class variable)",
               _mdl_discretization, ()),
    MethodDesc(Methods.EqualFreq,
               "Equal frequency, intervals: ", "equal freq, k={}",
               "Create bins with same number of instances",
               lambda data, var, k: disc.EqualFreq(k)(data, var),
               ("freq_spin", )),
    MethodDesc(Methods.EqualWidth,
               "Equal width, intervals: ", "equal width, k={}",
               "Create bins of the same width",
               lambda data, var, k: disc.EqualWidth(k)(data, var),
               ("width_spin", )),
    MethodDesc(Methods.Remove,
               "Remove", "remove",
               "Remove variable",
               lambda *_: None, ()),
    MethodDesc(Methods.Binning,
               "Natural binning, desired bins: ", "binning, desired={}",
               "Create bins with nice thresholds; "
               "try matching desired number of bins",
               lambda data, var, nbins: disc.Binning(nbins)(data, var),
               ("binning_spin", )),
    MethodDesc(Methods.FixedWidth,
               "Fixed width: ", "fixed width {}",
               "Create bins with the given width (not for time variables)",