def discretize_ent(infilename, outfilename):
    """
    Discretize a data set with the entropy/MDL method of [#fayyad1993]_.

    The discretization itself is delegated to the Orange Python module.
    Only continuous features of classification data sets are discretized.

    :param infilename: name of the input file (expecting an arff file)
    :type infilename: string
    :param outfilename: name of the output file
    :type outfilename: string
    """
    data = OTable(infilename)
    discretizer = Disc()
    discretizer.method = EntropyMDL()
    discretized = discretizer(data)

    prefix = "D_"
    for attribute in discretized.domain.attributes:
        if not attribute.name.startswith(prefix):
            continue
        # Renamed attribute: restore the original name and make the value
        # labels comma-free.
        attribute.name = attribute.name[len(prefix):]
        attribute.values = [
            label.replace(',', ";") for label in attribute.values
        ]

    # Save the discretized data.
    discretized.save(outfilename)
def discretize_ent(infilename, outfilename):
    """
    Apply the MDL discretization proposed by [#fayyad1993]_ to a data file.

    Requires the Orange Python module, which performs the discretization.
    Only continuous features of classification data sets are discretized.

    :param infilename: name of the input file (expecting an arff file)
    :type infilename: string
    :param outfilename: name of the output file
    :type outfilename: string
    """
    table = OTable(infilename)
    mdl = Disc()
    mdl.method = EntropyMDL()
    table_ent = mdl(table)

    # Attributes whose name carries the "D_" prefix were renamed; give them
    # back their original name and replace commas in their value labels.
    renamed = (a for a in table_ent.domain.attributes if a.name[:2] == "D_")
    for attr in renamed:
        attr.name = attr.name[2:]
        attr.values = [v.replace(',', ";") for v in attr.values]

    table_ent.save(outfilename)
def test_bayes(self):
    """Check CA of naive Bayes on a copied column, before and after
    flipping the last 20 target values."""
    n_rows, n_cols = 100, 5
    features = np.random.randint(2, size=(n_rows, n_cols))
    target_col = np.random.randint(n_cols)
    target = features[:, target_col].copy().reshape(n_rows, 1)

    table = Table.from_numpy(None, features, target)
    table = Discretize(method=discretize.EqualWidth(n=3))(table)
    learner = NaiveBayesLearner()

    # The target is an exact copy of one feature, so training accuracy is 1.
    results = TestOnTrainingData()(table, [learner])
    np.testing.assert_almost_equal(CA(results), [1])

    # Invert the last 20 labels; accuracy must drop, but not below 0.75.
    table.Y[-20:] = 1 - table.Y[-20:]
    results = TestOnTrainingData()(table, [learner])
    accuracy = CA(results)[0]
    self.assertGreaterEqual(accuracy, 0.75)
    self.assertLess(accuracy, 1)
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier.

    The learner handles discrete attributes only; with the default
    preprocessors, continuous attributes are discretized first.

    Parameters
    ----------
    preprocessors : list, optional (default="[Orange.preprocess.Discretize]")
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    name = 'naive bayes'
    preprocessors = [Discretize()]

    def fit_storage(self, table):
        """
        Build a NaiveBayesModel from the contingencies of `table`.

        Raises TypeError if `table` is not a Storage and
        NotImplementedError if any domain variable is not discrete.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if any(not var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        # The diagonal of the class-by-class contingency holds the class
        # frequencies.
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_cont))
        return NaiveBayesModel(cont, class_freq, table.domain)
def test_learner_scorer_previous_transformation(self):
    """Feature scores on previously discretized data are all positive."""
    from Orange.preprocess import Discretize

    learner = LogisticRegressionLearner()
    discretized_iris = Discretize()(self.iris)
    scores = learner.score_data(discretized_iris)
    # scores should be defined and positive
    all_positive = np.all(scores > 0)
    self.assertTrue(all_positive)
def set_data(self, data):
    """
    Accept a new input data set and configure the widget for it.

    Large SQL tables are replaced by a time-limited sample. Continuous
    attributes are discretized into four equal-frequency bins and the
    result is stored in `self.discrete_data`. Colouring controls are
    enabled or disabled depending on whether the data has a (discrete)
    class. A subset received before the data is processed at the end.

    Args:
        data: input data table, or None
    """
    # isinstance also covers subclasses of SqlTable, which an exact
    # type comparison would silently skip.
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.init_combos(self.data)
    self.information([0, 1, 2])
    if not self.data:
        self.discrete_data = None
        return

    # TODO: check
    # if data.has_missing_class():
    #     self.information(1, "Examples with missing classes were removed.")

    if any(attr.is_continuous for attr in data.domain):
        self.discrete_data = Discretize(method=EqualFreq(n=4))(data)
    else:
        self.discrete_data = self.data

    if self.data.domain.class_var is None:
        self.rb_colors.setDisabled(True)
        disc_class = False
    else:
        self.rb_colors.setDisabled(False)
        disc_class = self.data.domain.has_discrete_class
        self.rb_colors.group.button(2).setDisabled(not disc_class)
        self.bar_button.setDisabled(not disc_class)
    self.interior_coloring = bool(disc_class)
    self.openContext(self.data)

    # if we first received subset we now call setSubsetData to process it
    if self.unprocessed_subset_data:
        self.set_subset_data(self.unprocessed_subset_data)
        self.unprocessed_subset_data = None
def set_data(self, data):
    """
    Accept a new input data set and configure the widget for it.

    Large SQL tables are replaced by a time-limited sample. Continuous
    variables (including the class) are discretized into four
    equal-frequency bins and the result is stored in
    `self.discrete_data`. VizRank is reset and its button enabled only
    for data with more than one row and at least one attribute. A
    subset received before the data is processed at the end.

    Args:
        data: input data table, or None
    """
    # isinstance also covers subclasses of SqlTable, which an exact
    # type comparison would silently skip.
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.init_combos(self.data)
    if self.data is None:
        self.discrete_data = None
    elif any(attr.is_continuous for attr in data.domain):
        self.discrete_data = Discretize(
            method=EqualFreq(n=4), discretize_classes=True)(data)
    else:
        self.discrete_data = self.data

    self.vizrank.stop_and_reset()
    self.vizrank_button.setEnabled(
        self.data is not None
        and len(self.data) > 1
        and len(self.data.domain.attributes) >= 1)

    if self.data is None:
        return

    has_class = self.data.domain.class_var is not None
    self.rb_colors.setDisabled(not has_class)
    self.interior_coloring = (
        self.CLASS_DISTRIBUTION if has_class else self.PEARSON)
    self.openContext(self.data)

    # if we first received subset we now call setSubsetData to process it
    if self.unprocessed_subset_data:
        self.set_subset_data(self.unprocessed_subset_data)
        self.unprocessed_subset_data = None
def test_remove_constant(self):
    """With remove_const=True the attribute count must change once a
    column has been made constant."""
    iris = data.Table("iris")
    iris[:, 0] = 1  # make the first column constant
    discretized = Discretize(remove_const=True)(iris)
    n_before = len(iris.domain.attributes)
    n_after = len(discretized.domain.attributes)
    self.assertNotEqual(n_before, n_after)
def discretizer(data):
    """
    Return `data` with continuous variables and metas discretized.

    Data without any continuous variable or meta is returned unchanged.
    Otherwise four equal-frequency bins are used, constant features are
    kept, class variables and metas are discretized too, and the result
    is converted with `to_dense()`.
    """
    columns = chain(data.domain.variables, data.domain.metas)
    has_continuous = any(column.is_continuous for column in columns)
    if not has_continuous:
        return data

    to_discrete = Discretize(
        method=EqualFreq(n=4),
        remove_const=False,
        discretize_classes=True,
        discretize_metas=True)
    return to_discrete(data).to_dense()
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier.

    The learner handles discrete attributes only; with the default
    preprocessors, continuous attributes are discretized first.

    Parameters
    ----------
    preprocessors : list, optional (default="[Orange.preprocess.Discretize]")
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = "naive bayes"

    def fit_storage(self, table):
        """
        Build a NaiveBayesModel with add-one smoothed probabilities.

        Raises TypeError if `table` is not a Storage and
        NotImplementedError if any domain variable is not discrete.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if any(not var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_cont))
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))

        log_cont_prob = []
        for c in cont:
            counts = np.array(c)
            column_totals = np.sum(counts, axis=0)[None, :]
            smoothed = (counts + 1) / (column_totals + c.shape[0])
            log_cont_prob.append(np.log(smoothed / class_prob[:, None]))
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def set_data(self, data):
    """
    Accept new input data, validate it and prepare its discretized form.

    A warning is shown for data with fewer than two instances or without
    a class variable. Otherwise constant features are removed, the data
    is discretized with equal-frequency binning, and the result is kept
    in `self.disc_data` if at least two attributes remain.
    """
    self.closeContext()
    self.clear_messages()
    self.data = data
    self.disc_data = None
    self.selection = []

    if data is not None:
        if len(data) < 2:
            self.Warning.not_enough_inst()
        elif data.Y.size == 0:
            self.Warning.no_class_var()
        else:
            remove_constant = Remove(Remove.RemoveConstant)
            reduced = remove_constant(data)
            discretized = Discretize(method=EqualFreq())(reduced)
            if remove_constant.attr_results["removed"]:
                self.Information.removed_cons_feat()
            if len(discretized.domain.attributes) >= 2:
                self.disc_data = discretized
            else:
                self.Warning.not_enough_vars()

    domain = self.disc_data and self.disc_data.domain
    self.feature_model.set_domain(domain)
    self.openContext(self.disc_data)
    self.apply()
    self.vizrank.button.setEnabled(self.disc_data is not None)
def test_keep_constant(self):
    """With remove_const=False the attribute count stays the same even
    when a column has been made constant."""
    iris = data.Table('iris')
    iris[:, 0] = 1  # make the first column constant
    discretized = Discretize(remove_const=False)(iris)
    n_before = len(iris.domain.attributes)
    n_after = len(discretized.domain.attributes)
    self.assertEqual(n_before, n_after)
def test_discretize_metas(self):
    """Metas are left untouched by default and are discretized only when
    discretize_metas=True is given."""
    table = data.Table('iris')
    domain = table.domain
    # Move the fourth attribute and the class variable into the metas.
    regr_domain = data.Domain(
        domain.attributes[:3], [],
        [domain.attributes[3], domain.class_var])
    table = data.Table.from_table(regr_domain, table)

    # Default: both metas must be the very same variables as in the input.
    # Comparing against `table` (not `new_table` with itself) is what
    # makes these assertions able to fail.
    discretize = Discretize(remove_const=False)
    new_table = discretize(table)
    self.assertIs(new_table.domain.metas[0], table.domain.metas[0])
    self.assertIs(new_table.domain.metas[1], table.domain.metas[1])

    # With discretize_metas=True the first meta becomes discrete, while
    # the second is expected to be passed through unchanged.
    discretize = Discretize(remove_const=False, discretize_metas=True)
    new_table = discretize(table)
    self.assertIsInstance(new_table.domain.metas[0], DiscreteVariable)
    self.assertIs(new_table.domain.metas[1], table.domain.metas[1])
def set_data(self, data):
    """
    Accept a new input data set and select the attributes to show.

    Large SQL tables are replaced by a time-limited sample. Continuous
    attributes are discretized into four equal-frequency bins;
    `self.attrs` is filled with the variables of the discretized domain
    followed by the discrete metas of the input. The first two of them
    (or the first one twice, if there is only one) are selected.

    Args:
        data: input data table, or None
    """
    # isinstance also covers subclasses of SqlTable, which an exact
    # type comparison would silently skip.
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.areas = []
    if self.data is None:
        self.attrs[:] = []
    else:
        if any(attr.is_continuous for attr in data.domain):
            self.discrete_data = Discretize(method=EqualFreq(n=4))(data)
        else:
            self.discrete_data = self.data
        discrete_metas = (
            var for var in self.data.domain.metas if var.is_discrete)
        self.attrs[:] = list(
            chain(self.discrete_data.domain, discrete_metas))

    if self.attrs:
        self.attrX = self.attrs[0].name
        # The boolean indexes 0 or 1: the second attribute if there is
        # one, otherwise the first again.
        self.attrY = self.attrs[len(self.attrs) > 1].name
    else:
        self.attrX = self.attrY = None
        self.areas = self.selection = None

    self.openContext(self.data)
    self.resolve_shown_attributes()
    self.update_selection()
def create_contingencies(X, callback=None):
    """
    Build sliding-window contingencies over the discretized variables of X.

    X is discretized into ten equal-frequency bins. For each variable a
    mapping from bin index to a numeric value parsed from the bin label
    is built (falling back to the bin index when a label cannot be
    parsed). For SQL tables the counting is delegated to
    `create_sql_contingency`; otherwise rows are counted in memory, and
    rows containing NaN are skipped.

    Args:
        X: input data table
        callback: optional progress callback; called with two positional
            arguments, and only in the SQL branch

    Returns:
        list of contingencies, one per window
    """
    window_size = 1
    dim = len(X.domain)
    X_ = Discretize(method=EqualFreq(n=10))(X)

    m = []
    for var in X_.domain:
        bounds_per_bin = [
            tuple(part.strip()
                  for part in label.strip('[]()<>=≥').split('-'))
            for label in var.values
        ]
        try:
            numeric_bounds = [
                [float(bound) for bound in bounds]
                for bounds in bounds_per_bin
            ]
        except ValueError:
            # Labels that are not numeric intervals: use the bin index.
            bin_centers = {idx: idx for idx in range(len(bounds_per_bin))}
        else:
            # NOTE(review): lo + (hi - lo) evaluates to hi, i.e. the upper
            # bound rather than a midpoint - confirm this is intended.
            bin_centers = {
                idx: b[0] if len(b) == 1 else b[0] + (b[1] - b[0])
                for idx, b in enumerate(numeric_bounds)
            }
        m.append(bin_centers)

    from Orange.data.sql.table import SqlTable
    if isinstance(X, SqlTable):
        conts = []
        al = len(X.domain)
        if al > 1:
            conts.append(create_sql_contingency(X_, [0, 1], m))
            if callback:
                callback(1, al)
        for a1, a2, a3 in zip(range(al), range(1, al), range(2, al)):
            conts.append(create_sql_contingency(X_, [a1, a2, a3], m))
            if callback:
                callback(a3, al)
        if al > 2:
            conts.append(create_sql_contingency(X_, [al - 2, al - 1], m))
            if callback:
                callback(al, al)
    else:
        n_vars = len(X_.domain)
        counts = [defaultdict(float) for _ in range(n_vars)]
        for r in X_:
            if any(np.isnan(r)):
                continue
            row = tuple(m[vi].get(v) for vi, v in enumerate(r))
            for l in range(n_vars):
                # Window of neighbouring variables around l; None leaves
                # the slice open at the edges.
                start = l - window_size
                stop = l + window_size + 1
                lower = start if start >= 0 else None
                upper = stop if stop <= dim else None
                counts[l][row[lower:upper]] += 1
        conts = [
            (np.array(keys), np.array(weights))
            for keys, weights in (zip(*cnt.items()) for cnt in counts)
        ]
    return conts
def test_preprocessors_can_be_passed_in_as_generator(self):
    """Preprocessors given as a generator are accepted like any iterable."""
    expected = (Discretize(), )
    generator = (preprocessor for preprocessor in expected)
    learner = DummyLearnerPP(generator)
    active = tuple(learner.active_preprocessors)
    self.assertEqual(
        active, expected,
        'Preprocessors should be able to be passed in as single object '
        'as well as an iterable object')
def test_preprocessors_can_be_passed_in_as_non_iterable(self):
    """A single preprocessor instance is accepted for convenience."""
    single = Discretize()
    learner = DummyLearnerPP(preprocessors=single)
    active = tuple(learner.active_preprocessors)
    self.assertEqual(
        active, (single, ),
        'Preprocessors should be able to be passed in as single object '
        'as well as an iterable object')
def test_callback(self):
    """The progress callback starts at 0, ends at 1 and never decreases."""
    progress_callback = unittest.mock.Mock()
    learner = DummyLearner(preprocessors=[Discretize(), Randomize()])
    learner(Table("iris"), progress_callback)

    # First positional argument of every recorded call.
    reported = [call[0][0] for call in progress_callback.call_args_list]
    self.assertEqual(min(reported), 0)
    self.assertEqual(max(reported), 1)
    self.assertListEqual(reported, sorted(reported))
def predict_wine_quality(table, n):
    """
    Cross-validate a neural-network classifier on discretized data.

    The table is discretized into `n` equal-width bins, its first
    variable is used as the class and the remaining ones as features.
    Accuracy and AUC of a 10-fold cross validation are printed.
    """
    # Make the continuous variables discrete
    equal_width = Discretize()
    equal_width.method = discretize.EqualWidth(n=n)
    table = equal_width(table)

    # Define domain: first variable is the class, the rest are features
    class_label_var = table.domain[0]
    feature_vars = list(table.domain[1:])
    wine_domain = Domain(feature_vars, class_label_var)
    table = Table.from_table(domain=wine_domain, source=table)

    # Construct learner and print results
    nn_learner = NNClassificationLearner(
        hidden_layer_sizes=(10, ), max_iter=4000)
    eval_results = CrossValidation(table, [nn_learner], k=10)
    accuracy = scoring.CA(eval_results)[0]
    print("Accuracy of cross validation: {:.3f}".format(accuracy))
    auc = scoring.AUC(eval_results)[0]
    print("AUC: {:.3f}".format(auc))
def test_overrides_custom_preprocessors(self):
    """Preprocessors passed to the constructor replace the defaults
    defined on the learner."""
    custom = Discretize()
    learner = DummyLearnerPP(preprocessors=(custom, ))
    active = tuple(learner.active_preprocessors)
    self.assertEqual(
        active, (custom, ),
        'Learner should override default preprocessors when specified in '
        'constructor')
def set_data(self, data):
    """
    Handle a new input data set.

    Continuous variables (attributes, classes and metas) are discretized
    into four equal-frequency bins. `self.attrs` is rebuilt from the
    domain model, and the first two attributes are selected unless the
    context overrides this. `resolve_shown_attributes` then applies the
    attributes from the input, if any. The selection is cleared (again
    subject to the context) and the vizrank dialog is initialized
    without being shown.

    Args:
        data (Table): input data
    """
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.areas = []
    self.selection = set()

    if self.data is not None:
        self.domain_model.set_domain(data.domain)
        has_continuous = any(
            var.is_continuous
            for var in chain(data.domain, data.domain.metas))
        if has_continuous:
            to_discrete = Discretize(
                method=EqualFreq(n=4), remove_const=False,
                discretize_classes=True, discretize_metas=True)
            self.discrete_data = to_discrete(data)
        else:
            self.discrete_data = data
    else:
        self.attrs[:] = []
        self.domain_model.set_domain(None)

    self.attrs = [
        item for item in self.domain_model if isinstance(item, Variable)]
    if not self.attrs:
        self.attr_x = self.attr_y = None
        self.areas = []
        self.selection = set()
    else:
        self.attr_x = self.attrs[0]
        # The boolean indexes 0 or 1: the second attribute if there is
        # one, otherwise the first again.
        self.attr_y = self.attrs[len(self.attrs) > 1]

    self.openContext(self.data)
    self.resolve_shown_attributes()
    self.update_graph()
    self.update_selection()

    self.vizrank.initialize()
    can_rank = (
        self.data is not None
        and len(self.data) > 1
        and len(self.data.domain.attributes) > 1)
    self.vizrank_button.setEnabled(can_rank)
def formatTable(tble):
    '''
    Bin the data into four equal-width bins and one-hot encode the result.

    :param tble: input data table
    :return: data: tble with binned data,
             X: representation of data with one-hot-encoding,
             mapping: representation of what our one-hot-encoding is
    '''
    # Discretization (binning)
    # https://docs.orange.biolab.si/3/data-mining-library/reference/preprocess.html
    print("Discretizing data")
    disc = Discretize()
    disc.method = discretize.EqualWidth(n=4)
    data = disc(tble)

    print("One hot encoding data")
    X, mapping = OneHot.encode(data, include_class=True)
    # `mapping` is returned as produced by the encoder. A former
    # `sorted(mapping.items())` statement built a sorted copy and threw
    # it away, so it had no effect and is not kept.
    return data, X, mapping
def test_use_default_preprocessors_property(self):
    """Default preprocessors can be requested in addition to the ones
    passed to the constructor."""
    learner = DummyLearnerPP(preprocessors=(Discretize(), ))
    learner.use_default_preprocessors = True

    active = list(learner.active_preprocessors)
    self.assertEqual(
        len(active), 2,
        'Learner did not properly insert custom preprocessor into '
        'preprocessor list')
    custom, default = active[0], active[1]
    self.assertIsInstance(
        custom, Discretize,
        'Custom preprocessor was inserted in incorrect order')
    self.assertIsInstance(default, Randomize)
def _get_discrete_data(self, data): """ Discretize continuous attributes. Return None when there is no data, no rows, or no primitive attributes. """ if (data is None or not len(data) or not any( attr.is_discrete or attr.is_continuous for attr in chain(data.domain.variables, data.domain.metas))): return None elif any(attr.is_continuous for attr in data.domain.variables): return Discretize(method=EqualFreq(n=4), remove_const=False, discretize_classes=True, discretize_metas=True)(data) else: return data
def setUp(self):
    """Create wide test domains and a table, plus the domains obtained
    by discretizing and normalizing that table."""
    n_cols, n_rows = 1000, 100
    continuous = [ContinuousVariable(str(idx)) for idx in range(n_cols)]
    discrete = [
        DiscreteVariable("D" + str(idx), values=("1", "2"))
        for idx in range(n_cols)
    ]

    self.domain = Domain(continuous + discrete)
    extended = list(self.domain.attributes) + [ContinuousVariable("x")]
    self.domain_x = Domain(extended)
    self.single = Domain([ContinuousVariable("0")])

    # Seeded generator: the table holds reproducible 0/1 values.
    values = np.random.RandomState(0).randint(
        0, 2, (n_rows, len(self.domain)))
    self.table = Table.from_numpy(self.domain, values)

    self.discretized_domain = Discretize(EqualFreq(n=3))(self.table).domain
    self.normalized_domain = Normalize()(self.table).domain
def create_contingencies(X, callback=None):
    """
    Build sliding-window contingencies over the discretized variables of X.

    X is discretized into ten equal-frequency bins and the per-variable
    bin values are obtained from `get_bin_centers`. For SQL tables the
    counting is delegated to `create_sql_contingency`; otherwise rows
    are counted in memory, and rows containing NaN are skipped.

    Args:
        X: input data table
        callback: optional progress callback; called with two positional
            arguments, and only in the SQL branch

    Returns:
        list of contingencies, one per window
    """
    window_size = 1
    dim = len(X.domain)
    X_ = Discretize(method=EqualFreq(n=10))(X)
    m = get_bin_centers(X_)

    from Orange.data.sql.table import SqlTable
    if isinstance(X, SqlTable):
        conts = []
        al = len(X.domain)
        if al > 1:
            conts.append(create_sql_contingency(X_, [0, 1], m))
            if callback:
                callback(1, al)
        for a1, a2, a3 in zip(range(al), range(1, al), range(2, al)):
            conts.append(create_sql_contingency(X_, [a1, a2, a3], m))
            if callback:
                callback(a3, al)
        if al > 2:
            conts.append(create_sql_contingency(X_, [al - 2, al - 1], m))
            if callback:
                callback(al, al)
    else:
        n_vars = len(X_.domain)
        counts = [defaultdict(float) for _ in range(n_vars)]
        for r in X_:
            if any(np.isnan(r)):
                continue
            row = tuple(m[vi].get(v) for vi, v in enumerate(r))
            for l in range(n_vars):
                # Window of neighbouring variables around l; None leaves
                # the slice open at the edges.
                start = l - window_size
                stop = l + window_size + 1
                lower = start if start >= 0 else None
                upper = stop if stop <= dim else None
                counts[l][row[lower:upper]] += 1
        conts = [
            (np.array(keys), np.array(weights))
            for keys, weights in (zip(*cnt.items()) for cnt in counts)
        ]
    return conts
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier.

    The learner handles discrete attributes only; with the default
    preprocessors, continuous attributes are discretized first.

    Parameters
    ----------
    preprocessors : list, optional (default="[Orange.preprocess.Discretize]")
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = 'naive bayes'

    def fit_storage(self, table):
        """
        Build a NaiveBayesModel with Laplace-smoothed probabilities.

        Raises TypeError if `table` is not a Storage, NotImplementedError
        if any domain variable is not discrete, and ValueError if no
        class value occurs in the data.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if any(not var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only categorical variables are "
                                      "supported.")

        cont = contingency.get_contingencies(table)
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_cont))
        nclss = (class_freq != 0).sum()
        if not nclss:
            raise ValueError("Data has no defined target values.")

        # Laplacian smoothing considers only classes that appear in the data,
        # in part to avoid cases where the probabilities are affected by empty
        # (or completely spurious) classes that appear because of Orange's
        # reuse of variables. See GH-2943.
        # The corresponding elements of class_prob are set to zero only after
        # mock non-zero values are used in computation of log_cont_prob to
        # prevent division by zero.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
        log_cont_prob = []
        for c in cont:
            counts = np.array(c)
            column_totals = np.sum(counts, axis=0)[None, :]
            smoothed = (counts + 1) / (column_totals + nclss)
            log_cont_prob.append(np.log(smoothed / class_prob[:, None]))
        class_prob[class_freq == 0] = 0
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
class TestCA(unittest.TestCase):
    """Tests of the classification accuracy (CA) score."""

    @staticmethod
    def _two_perfect_methods():
        """Return Results for two methods that both predict 100 rows
        (half class 0, half class 1) perfectly."""
        res = Results(nmethods=2, nrows=100)
        res.actual[:50] = 0
        res.actual[50:] = 1
        res.predicted = np.vstack((res.actual, res.actual))
        return res

    def test_init(self):
        """CA called directly on results gives the expected accuracies."""
        res = self._two_perfect_methods()
        np.testing.assert_almost_equal(CA(res), [1, 1])

        # One wrong prediction for the first method.
        res.predicted[0][0] = 1
        np.testing.assert_almost_equal(CA(res), [0.99, 1])

        # Every prediction of the second method inverted.
        res.predicted[1] = 1 - res.predicted[1]
        np.testing.assert_almost_equal(CA(res), [0.99, 0])

    def test_call(self):
        """A CA instance called on results gives the same accuracies."""
        res = self._two_perfect_methods()
        ca = CA()
        np.testing.assert_almost_equal(ca(res), [1, 1])

        # One wrong prediction for the first method.
        res.predicted[0][0] = 1
        np.testing.assert_almost_equal(ca(res), [0.99, 1])

        # Every prediction of the second method inverted.
        res.predicted[1] = 1 - res.predicted[1]
        np.testing.assert_almost_equal(ca(res), [0.99, 0])

    def test_bayes(self):
        """Check CA of naive Bayes on a copied column, before and after
        flipping the last 20 target values."""
        n_rows, n_cols = 100, 5
        features = np.random.randint(2, size=(n_rows, n_cols))
        target_col = np.random.randint(n_cols)
        target = features[:, target_col].copy().reshape(n_rows, 1)

        table = Table(features, target)
        table = Discretize(method=discretize.EqualWidth(n=3))(table)
        learner = NaiveBayesLearner()

        results = TestOnTrainingData(table, [learner])
        np.testing.assert_almost_equal(CA(results), [1])

        table.Y[-20:] = 1 - table.Y[-20:]
        results = TestOnTrainingData(table, [learner])
        accuracy = CA(results)[0]
        self.assertGreaterEqual(accuracy, 0.75)
        self.assertLess(accuracy, 1)
def _setup(self):
    """
    Rebuild the plots for the currently selected variable.

    Clears both plots and the legend, resolves the selected variable and
    the optional grouping variable, optionally discretizes the data, and
    then displays either a contingency (when a grouping variable is set)
    or a distribution.
    """
    # Start from a clean slate.
    self.plot.clear()
    self.plot_prob.clear()
    self._legend.clear()
    self._legend.hide()

    varidx = self.variable_idx
    self.var = self.cvar = None
    if varidx >= 0:
        self.var = self.varmodel[varidx]
    # NOTE(review): index 0 of the group model presumably means
    # "no grouping" - confirm against the model setup.
    if self.groupvar_idx > 0:
        self.cvar = self.groupvarmodel[self.groupvar_idx]
        # Repopulate the probability combo: "(None)", one entry per
        # value of the grouping variable, then "(All)".
        prob = self.controls.show_prob
        prob.clear()
        prob.addItem("(None)")
        prob.addItems(self.cvar.values)
        prob.addItem("(All)")
        # Clamp the stored choice to the indices of the items added above.
        self.show_prob = min(max(self.show_prob, 0),
                             len(self.cvar.values) + 1)
    data = self.data
    self._setup_smoothing()
    if self.var is None:
        return
    if self.disc_cont:
        # Restrict the data to the shown variable (and grouping variable,
        # if any) and bin it into equal-width intervals; the number of
        # bins comes from self.bins at the current smoothing index.
        domain = Orange.data.Domain(
            [self.var, self.cvar] if self.cvar else [self.var])
        data = Orange.data.Table(domain, data)
        disc = EqualWidth(n=self.bins[self.smoothing_index])
        data = Discretize(method=disc, remove_const=False)(data)
        # From here on, work with the first variable of the new domain.
        self.var = data.domain[0]
    self.set_left_axis_name()
    self.enable_disable_rel_freq()
    # The cumulative-distribution control is enabled only for a
    # continuous variable.
    self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
    if self.cvar:
        self.contingencies = \
            contingency.get_contingency(data, self.var, self.cvar)
        self.display_contingency()
    else:
        self.distributions = \
            distribution.get_distribution(data, self.var)
        self.display_distribution()
    self.plot.autoRange()
def test_discrete_features(self):
    """Discretized input shows a data error with a one-row feature model
    unless a Continuize preprocessor is supplied."""
    widget = self.widget
    feature_model = widget.controls._feature.model()
    data_error = widget.Error.data_error

    # Discretized data alone: error shown, model has a single row.
    discretized_housing = Discretize()(self.housing)
    self.send_signal(widget.Inputs.data, discretized_housing)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertTrue(data_error.is_shown())

    # A Continuize preprocessor resolves the error.
    self.send_signal(widget.Inputs.preprocessor, Continuize())
    self.assertGreater(feature_model.rowCount(), 1)
    self.assertFalse(data_error.is_shown())

    # Removing the preprocessor brings the error back.
    self.send_signal(widget.Inputs.preprocessor, None)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertTrue(data_error.is_shown())

    # Removing the data clears the error.
    self.send_signal(widget.Inputs.data, None)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertFalse(data_error.is_shown())
def setUp(self):
    """Load the iris and adult data sets and create a discretizer with
    three equal-frequency bins."""
    equal_freq = EqualFreq(n=3)
    self.iris = Table("iris")
    self.adult = Table("adult")
    self.discretizer = Discretize(equal_freq)