def test_hoeffding_anytime_tree_coverage():
    """Exercise HATT code paths that the other tests do not reach.

    The test makes no assertions: it passes as long as both training runs
    (and the ``reset`` call) complete without raising.
    """
    n_samples = 15000

    # Cover memory management
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    sea_stream.prepare_for_use()
    X, y = sea_stream.next_sample(n_samples)

    memory_bound_learner = HATT(max_byte_size=30,
                                memory_estimate_period=100,
                                grace_period=10,
                                leaf_prediction='nba')
    memory_bound_learner.partial_fit(X, y, classes=sea_stream.target_values)
    memory_bound_learner.reset()

    # Cover nominal attribute observer
    tree_stream = RandomTreeGenerator(tree_random_state=23,
                                      sample_random_state=12,
                                      n_classes=2,
                                      n_cat_features=2,
                                      n_categories_per_cat_feature=4,
                                      n_num_features=1,
                                      max_tree_depth=30,
                                      min_leaf_depth=10,
                                      fraction_leaves_per_level=0.45)
    tree_stream.prepare_for_use()
    X, y = tree_stream.next_sample(n_samples)

    nominal_learner = HATT(leaf_prediction='nba',
                           nominal_attributes=list(range(1, 9)))
    nominal_learner.partial_fit(X, y, classes=tree_stream.target_values)
def test_hoeffding_anytime_tree_nba(test_path):
    """Check a default-configured HATT against recorded reference outputs.

    5000 samples are drawn one at a time from a seeded ``RandomTreeGenerator``.
    Every 100th sample (except the very first) is predicted *before* it is
    learned, which yields 49 predictions. These are compared with a hard-coded
    prediction list and with the first 49 rows of the stored probability file.
    The learner's info string and model description are also checked.

    Parameters
    ----------
    test_path
        Directory that contains ``test_hoeffding_anytime_tree.npy``.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1, max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = HATT(nominal_attributes=list(range(1, 9)))

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0,
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(predictions == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_anytime_tree.npy')
    # Only the first 49 stored rows correspond to this 5000-sample run.
    expected_proba = np.load(test_file)[:49, :]
    assert np.allclose(proba_predictions, expected_proba)

    # Exact string comparison: whitespace inside these literals is significant.
    expected_info = "HATT(binary_split=False, grace_period=200, leaf_prediction='nba',\n" \
                    " max_byte_size=33554432, memory_estimate_period=1000000,\n" \
                    " min_samples_reevaluate=20, nb_threshold=0,\n" \
                    " nominal_attributes=[1, 2, 3, 4, 5, 6, 7, 8], split_confidence=1e-07,\n" \
                    " split_criterion='info_gain', stop_mem_management=False,\n" \
                    " tie_threshold=0.05)"
    assert learner.get_info() == expected_info

    expected_model = 'ifAttribute1=0.0:ifAttribute3=0.0:Leaf=Class1|{0:260.0,1:287.0}' \
                     'ifAttribute3=1.0:Leaf=Class0|{0:163.0,1:117.0}ifAttribute1=1.0:Leaf=Class0|{0:718.0,1:495.0}'
    # Both sides are normalised so that layout whitespace does not matter.
    assert (learner.get_model_description().replace("\n", " ").replace(" ", "")
            == expected_model.replace(" ", ""))

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_hoeffding_anytime_tree_nb_gini(test_path):
    """Check HATT with ``leaf_prediction='nb'`` and ``split_criterion='gini'``.

    5000 samples are drawn one at a time from a seeded ``RandomTreeGenerator``.
    Every 100th sample (except the very first) is predicted before it is
    learned, which yields 49 predictions that are compared with a hard-coded
    list. The learner's info string is checked as well.

    Parameters
    ----------
    test_path
        Accepted for signature parity with the sibling tests; not read here.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1, max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = HATT(nominal_attributes=list(range(1, 9)),
                   leaf_prediction='nb', split_criterion='gini')

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            # Collected but not asserted on in this test.
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0,
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(predictions == expected_predictions)

    # Exact string comparison: whitespace inside these literals is significant.
    expected_info = "HATT(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    " max_byte_size=33554432, memory_estimate_period=1000000,\n" \
                    " min_samples_reevaluate=20, nb_threshold=0,\n" \
                    " nominal_attributes=[1, 2, 3, 4, 5, 6, 7, 8], split_confidence=1e-07,\n" \
                    " split_criterion='gini', stop_mem_management=False, tie_threshold=0.05)"
    assert learner.get_info() == expected_info
def basic_simulation():
    """Train a single HATT model on simulated click batches and print progress.

    Ten batches of 1000 visits are requested from ``de.get_batches``. For each
    batch a click is generated per visit for the fixed choice ``0``, the clicks
    are printed, the model is trained on the batch, and the fit time is printed.

    NOTE(review): the original indentation was lost, so it is assumed that the
    timing/"--Model" prints and ``print_predictions`` run once per batch
    (inside the loop) -- confirm against the original file.
    """
    model = HATT()
    print("EFDT Basic Sim")
    num_choices = 2  # not referenced again within this function
    click_gen = de.default_sex_age_click_gen()
    de.test_default_sex_age_click_gen()
    batch_size = 1000
    batch_count = 10
    batches = de.get_batches(batch_size, batch_count)
    choice = 0
    for i, batch in enumerate(batches):
        # Every visit in the batch is paired with the same choice.
        choices = [choice] * len(batch)
        # The comprehension's own `choice` shadows the outer one only inside
        # the comprehension; the outer `choice` is unaffected.
        clicks = [click_gen.get_click(visit, choice) for visit,choice in zip(batch, choices)]
        print_clicks(de.types, i, clicks, batch, choices)
        # Wall-clock timing of one incremental fit.
        tim = time.time()
        model.partial_fit(batch, clicks)
        print("fit time: ", time.time() - tim)
        print("--Model: ", choice)
        print_predictions(model)
def test_hoeffding_anytime_tree(test_path):
    """Check a default-configured HATT over 15000 samples against references.

    Samples are drawn one at a time from a seeded ``RandomTreeGenerator``.
    Every 100th sample (except the very first) is predicted before it is
    learned, which yields 149 predictions. These are compared with a hard-coded
    prediction list and with the stored probability file. The learner's info
    string and model description are also checked.

    Parameters
    ----------
    test_path
        Directory that contains ``test_hoeffding_anytime_tree.npy``.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1, max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()

    learner = HATT(nominal_attributes=list(range(1, 9)))

    cnt = 0
    max_samples = 15000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 1, 1,
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_anytime_tree.npy')
    data_prob = np.load(test_file)

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(predictions == expected_predictions)
    # Probabilities are floats: compare with a tolerance rather than exact
    # equality, as the sibling 'nba' test does.
    assert np.allclose(proba_predictions, data_prob)

    # Exact string comparison, including the trailing ' - '.
    expected_info = 'HATT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 - ' \
                    'min_samples_reevaluate: 20 - split_criterion: info_gain - split_confidence: 1e-07 - ' \
                    'tie_threshold: 0.05 - binary_split: False - stop_mem_management: False - leaf_prediction: ' \
                    'nba - nb_threshold: 0 - nominal_attributes: [1, 2, 3, 4, 5, 6, 7, 8] - '
    assert learner.get_info() == expected_info

    expected_model = 'if Attribute 1 = 0: if Attribute 3 = 0: Leaf = Class 1 | {0: 896.0, 1: 947.0} ' \
                     'if Attribute 3 = 1: Leaf = Class 0 | {0: 500.0, 1: 388.0} if Attribute 1 = 1: ' \
                     'if Attribute 5 = 0: Leaf = Class 0 | {0: 404.0, 1: 259.0} if Attribute 5 = 1: ' \
                     'Leaf = Class 0 | {0: 166.0, 1: 82.0}'
    # Both sides are normalised so that layout whitespace does not matter.
    assert (learner.get_model_description().replace("\n", " ").replace(" ", "")
            == expected_model.replace(" ", ""))

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
class Tree():
    """Tkinter view that trains a tree model and displays one state per timestamp.

    The instance registers itself with the given provider both as a listener
    (``notify`` is the listener callback) and as an algorithm (``train`` is
    the training callback). After every training step the current tree root is
    wrapped in a ``Node`` and appended to ``history``; ``notify`` then shows
    the entry that matches the provider's current timestamp.
    """

    def check_input(self, kwargs):
        """Return whether ``kwargs`` carries valid ``'tk'`` and ``'provider'`` entries.

        Parameters
        ----------
        kwargs : dict
            Keyword arguments passed to the constructor.

        Returns
        -------
        bool
            ``True`` only when both keys are present, ``kwargs['tk']`` is a
            ``tkinter.Tk`` and ``kwargs['provider']`` is a ``DataProvider``.
            ``False`` otherwise (always a real ``bool``, never ``None``).
        """
        # Bail out before touching the values so missing keys never raise.
        if 'tk' not in kwargs or 'provider' not in kwargs:
            return False
        return (isinstance(kwargs['tk'], tkinter.Tk)
                and isinstance(kwargs['provider'], DataProvider))

    def __init__(self, **kwargs):
        """Validate the arguments, subscribe to the provider and build the frame.

        Parameters
        ----------
        tk : tkinter.Tk
            Parent Tk object for the master frame.
        provider : DataProvider
            Source of training objects and timestamps.

        Raises
        ------
        ValueError
            If ``tk`` or ``provider`` is missing or of the wrong type.
        """
        if not self.check_input(kwargs):
            raise ValueError(
                'Tree expects two arguments: \'tk\', a Tkinter object, and \'provider\' a DataProvider object'
            )

        # save arguments
        self.tk = kwargs['tk']
        self.provider = kwargs['provider']

        # subscribe to algorithm and listeners lists
        self.provider.subscribe_to_listening_list(self)
        self.provider.subscribe_to_algorithm_list(self)

        # list to save all tree states (store root nodes)
        self.history = []
        # -1 means that no timestamp has been displayed yet
        self.current_timestamp = -1

        # the underlying model (per the original comment, the extremely fast
        # decision tree classifier)
        self.tree = TreeClass()

        # master frame
        self.frame = tkinter.Frame(self.tk,
                                   highlightthickness=5,
                                   highlightbackground='black',
                                   bd=0)
        self.frame.pack(side=tkinter.TOP, fill=tkinter.BOTH)

    def _save_tree_state(self):
        """Append a ``Node`` built from the model's current root to ``history``."""
        self.history.append(Node(self.tree._tree_root))

    def notify(self):
        """Replace the displayed tree with the one for the provider's timestamp."""
        # destroy previous widgets (nothing to destroy before the first display)
        if self.current_timestamp != -1:
            self.history[self.current_timestamp].destroy()

        # update 'self.current_timestamp' and build widgets for the new timestamp
        self.current_timestamp = self.provider.get_timestamp()
        self.history[self.current_timestamp].build_widget(self.frame)

    def train(self):
        """Fit the model on the provider's current object and record the new state.

        Only ``history`` is updated here; ``notify`` takes care of the widgets.
        """
        # train tree using current object
        obj = self.provider.get_current_object()
        self.tree.partial_fit(obj['x'], obj['y'])

        # save new tree
        self._save_tree_state()