def test_hoeffding_tree(test_path):
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 3.0, 0.0, 1.0, 1.0,
                                       2.0, 0.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 0.0, 1.0,
                                       1.0, 1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0,
                                       2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0,
                                       2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npz')
    data = np.load(test_file)
    expected_proba_predictions_0 = data["a"]
    expected_proba_predictions_1 = data["b"]

    assert np.alltrue(predictions == expected_predictions)
    assert np.alltrue(proba_predictions == expected_proba_predictions_0) or \
        np.alltrue(proba_predictions == expected_proba_predictions_1)
    assert np.alltrue(predictions == expected_predictions)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [5, 6, 7,' \
                    ' 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1745.0, 2.0: 978.0, 0.0: 1423.0, 3.0: 854.0}\n'
    assert (learner.get_model_description() == expected_model_1) \
        or (learner.get_model_description() == expected_model_2)

def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch,
                              update_splitter_counts=False, found_nodes=None):
    if found_nodes is None:
        found_nodes = []
    if update_splitter_counts:
        try:
            self._observed_class_distribution[y] += weight  # Dictionary (class_value, weight)
        except KeyError:
            self._observed_class_distribution[y] = weight
    child_index = self.instance_child_index(X)
    if child_index >= 0:
        child = self.get_child(child_index)
        if child is not None:
            child.filter_instance_to_leaves(X, y, weight, parent, parent_branch,
                                            update_splitter_counts, found_nodes)
        else:
            found_nodes.append(HoeffdingTree.FoundNode(None, self, child_index))
    if self._alternate_tree is not None:
        self._alternate_tree.filter_instance_to_leaves(X, y, weight, self, -999,
                                                       update_splitter_counts, found_nodes)

def test_evaluate_prequential_classifier(tmpdir, test_path):
    # Setup file stream
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    max_samples = 1000
    metrics = ['kappa', 'kappa_t', 'performance']
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)

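# The test above relies on a compare_files helper defined elsewhere in the test module.
# A minimal sketch of what such a helper might look like, assuming the generated summary
# only needs a line-by-line comparison with comment lines ignored:
def compare_files(generated_file, expected_file):
    with open(generated_file) as generated, open(expected_file) as expected:
        generated_lines = [line for line in generated if not line.startswith('#')]
        expected_lines = [line for line in expected if not line.startswith('#')]
    assert generated_lines == expected_lines
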
def filter_instance_to_leaves(self, X, split_parent, parent_branch, update_splitter_counts,
                              found_nodes=None):
    if found_nodes is None:
        found_nodes = []
    found_nodes.append(HoeffdingTree.FoundNode(self, split_parent, parent_branch))

def demo():
    # The classifiers we will compare (HoeffdingTree, SAMKNN, LeverageBagging, SGD)
    h = [HoeffdingTree(), SAMKNN(), LeverageBagging(), SGDClassifier()]

    # Demo 1 -- plot should not fail
    demo_parameterized(h)

    # Demo 2 -- csv output should look nice
    demo_parameterized(h, "sea_stream.csv", False)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(h, "covtype.csv", False)

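# The demo above delegates to a demo_parameterized helper that is not shown here.
# A minimal sketch of such a helper, assuming the CSV files live under ../datasets/
# and using the same FileStream/EvaluatePrequential API as the other demos:
def demo_parameterized(h, filename="covtype.csv", show_plot=True):
    # Setup Stream
    stream = FileStream("../datasets/" + filename, -1, 1)
    stream.prepare_for_use()

    # Setup the evaluator and run the prequential evaluation on the list of models
    evaluator = EvaluatePrequential(pretrain_size=500, output_file='output.csv', max_samples=10000,
                                    batch_size=1, n_wait=500, show_plot=show_plot, metrics=['performance'])
    evaluator.evaluate(stream=stream, model=h)
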
def test_hoeffding_tree():
    stream = RandomTreeGenerator(tree_seed=23, instance_seed=12, n_classes=4, n_nominal_attributes=2,
                                 n_numerical_attributes=5, n_values_per_nominal=5, max_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = [x for x in range(15, len(stream.get_attributes_header()))]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_instance()
        # Test every n samples
        if cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [0.0, 3.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 0.0, 3.0,
                                       3.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 3.0,
                                       0.0, 1.0, 1.0, 0.0, 2.0, 2.0, 1.0, 2.0, 0.0, 0.0,
                                       0.0, 2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 1.0,
                                       1.0, 3.0, 1.0, 0.0, 3.0, 0.0, 1.0, 1.0, 0.0, 0.0])
    assert np.alltrue(predictions == expected_predictions)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1384.0, 1.0: 1720.0, 2.0: 1005.0, 3.0: 891.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1720.0, 2.0: 1005.0, 0.0: 1384.0, 3.0: 891.0}\n'
    assert (learner.get_model_description() == expected_model_1) \
        or (learner.get_model_description() == expected_model_2)

def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging

    This demo shows the evaluation process of a LeverageBagging classifier,
    initialized with HoeffdingTree classifiers.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    stream = SEAGenerator(classification_function=2, instance_seed=755437, noise_percentage=0.0)
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    # classifier = LeverageBagging(h=KNN(k=8, max_window_size=2000, leaf_size=30), ensemble_length=1)
    classifier = LeverageBagging(h=HoeffdingTree(), ensemble_length=2)

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000, max_instances=instances, batch_size=1, n_wait=200,
                                    max_time=1000, output_file=output_file, task_type='classification',
                                    show_plot=False, plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)

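# Typical invocation of the demo above when the module is run as a script;
# the output file name here is only an example.
if __name__ == '__main__':
    demo(output_file='leverage_bagging_summary.csv', instances=40000)
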
def demo():
    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    stream = FileStream("../datasets/sea_stream.csv", -1, 1)
    stream.prepare_for_use()

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='output.csv', max_samples=10000,
                                    batch_size=1, n_wait=1000, show_plot=True, metrics=['performance'])
    evaluator.evaluate(stream=stream, model=h)

def demo():
    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    T_init = 100
    evaluator = EvaluatePrequential(pretrain_size=T_init, output_file='output.csv', max_instances=10000,
                                    batch_size=1, n_wait=1000, task_type='classification', show_plot=True,
                                    plot_options=['performance'])
    evaluator.eval(stream=stream, classifier=h)

def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging

    This demo shows the evaluation process of a LeverageBagging classifier,
    initialized with HoeffdingTree classifiers.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # stream = FileStream("../datasets/sea_big.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, noise_percentage=0.0)
    # stream.prepare_for_use()
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    # classifier = LeverageBagging(h=KNN(k=8, max_window_size=2000, leaf_size=30), ensemble_length=1)
    pipe = LeverageBagging(h=HoeffdingTree(), ensemble_length=2)

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000, max_samples=instances, output_file=output_file,
                                    show_plot=False)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

def demo():
    """ _test_pipeline

    This demo demonstrates the Pipeline structure seamlessly working as a learner,
    while being passed as parameter to an EvaluatePrequential object.

    """
    # # Setup the stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    # stream.prepare_for_use()
    # # If used for Hoeffding Trees then need to pass indices for Nominal attributes

    # Test with RandomTreeGenerator
    # stream = RandomTreeGenerator(n_classes=2, n_numerical_attributes=5)
    # stream.prepare_for_use()

    # Test with WaveformGenerator
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = PerceptronMask()
    # classifier = NaiveBayes()
    # classifier = PassiveAggressiveClassifier()
    classifier = HoeffdingTree()

    # Setup the pipeline
    pipe = Pipeline([('Hoeffding Tree', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=True, pretrain_size=1000, max_instances=100000)

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)

def filter_instance_to_leaves(self, X, parent, parent_branch, update_splitter_counts, found_nodes=None):
    if found_nodes is None:
        found_nodes = []
    child_index = self.instance_child_index(X)
    if child_index >= 0:
        child = self.get_child(child_index)
        if child is not None:
            child.filter_instance_to_leaves(X, parent, parent_branch, update_splitter_counts, found_nodes)
        else:
            found_nodes.append(HoeffdingTree.FoundNode(None, self, child_index))
    if self._alternate_tree is not None:
        self._alternate_tree.filter_instance_to_leaves(X, self, -999, update_splitter_counts, found_nodes)

from skmultiflow.options.file_option import FileOption
from my_classifier import BatchClassifier

dataset = "elec"

# 1. Create a stream
opt = FileOption("FILE", "OPT_NAME", "./data/" + dataset + ".csv", "CSV", False)
stream = FileStream(opt, -1, 1)

# 2. Prepare for use
stream.prepare_for_use()

# 3. Instantiate the classifiers to compare
h = [
    KNN(k=10, max_window_size=100, leaf_size=30),
    HoeffdingTree(),
    BatchClassifier(window_size=100, max_models=10),
]

# 4. Setup the evaluator
evaluator = EvaluatePrequential(pretrain_size=1000, output_file='result_' + dataset + '.csv', max_instances=10000,
                                batch_size=1, n_wait=500, max_time=1000000000, task_type='classification',
                                show_plot=True, plot_options=['performance'])

# 5. Run
evaluator.eval(stream=stream, classifier=h)

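# The script above imports BatchClassifier from a user-defined my_classifier module that
# is not part of scikit-multiflow. A minimal sketch of such a class, assuming it buffers
# the stream into fixed-size windows, trains one batch model per window (a scikit-learn
# GaussianNB is used here purely as a placeholder), keeps at most max_models of them, and
# predicts by majority vote:
import numpy as np
from sklearn.naive_bayes import GaussianNB


class BatchClassifier:
    def __init__(self, window_size=100, max_models=10):
        self.window_size = window_size
        self.max_models = max_models
        self.models = []
        self.X_buffer = []
        self.y_buffer = []

    def partial_fit(self, X, y, classes=None, weight=None):
        # Accumulate samples; once a full window is collected, fit a new batch model
        for Xi, yi in zip(X, y):
            self.X_buffer.append(Xi)
            self.y_buffer.append(yi)
            if len(self.X_buffer) >= self.window_size:
                model = GaussianNB()
                model.fit(np.asarray(self.X_buffer), np.asarray(self.y_buffer))
                self.models.append(model)
                if len(self.models) > self.max_models:
                    self.models.pop(0)  # drop the oldest model
                self.X_buffer, self.y_buffer = [], []
        return self

    def predict(self, X):
        X = np.asarray(X)
        if not self.models:
            # No model trained yet: fall back to predicting class 0
            return np.zeros(len(X))
        # Majority vote over the ensemble of batch-trained models
        votes = np.asarray([model.predict(X) for model in self.models])
        return np.asarray([np.bincount(column.astype(int)).argmax() for column in votes.T])
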
from skmultiflow.core.pipeline import Pipeline
from skmultiflow.data.file_stream import FileStream
from skmultiflow.options.file_option import FileOption
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.classification.trees.hoeffding_tree import HoeffdingTree

# Setup the File Stream
# opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
# opt = FileOption("FILE", "OPT_NAME", "../datasets/movingSquares.csv", "CSV", False)
opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV", False)
stream = FileStream(opt, -1, 1)
stream.prepare_for_use()

# Setup the classifiers
clf_one = HoeffdingTree()
clf_two = AdaptiveRandomForest()

# Setup the pipeline for clf_one
pipe = Pipeline([('Classifier', clf_one)])

# Create the list to hold both classifiers
classifier = [pipe, clf_two]

# Setup the evaluator
evaluator = EvaluatePrequential(pretrain_size=200, max_instances=100000, batch_size=1, max_time=1000,
                                output_file='comparison_Hoeffding_ADFH_Preq.csv', task_type='classification',