def test_hoeffding_tree_coverage():
    """Coverage-only test: exercise memory management and nominal attributes.

    Nothing is asserted; the test passes as long as no call raises.
    """
    # Cover memory management (very small max_byte_size, frequent estimates)
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    sea_stream.prepare_for_use()
    X, y = sea_stream.next_sample(5000)

    learner = HoeffdingTree(
        max_byte_size=30,
        memory_estimate_period=100,
        grace_period=10,
        leaf_prediction='mc',
    )
    learner.partial_fit(X, y, classes=sea_stream.target_values)
    learner.reset()

    # Cover nominal attribute observer (stream without numeric features)
    tree_stream = RandomTreeGenerator(
        tree_random_state=1,
        sample_random_state=1,
        n_num_features=0,
        n_categories_per_cat_feature=2,
    )
    tree_stream.prepare_for_use()
    X, y = tree_stream.next_sample(1000)

    learner = HoeffdingTree(leaf_prediction='mc',
                            nominal_attributes=list(range(10)))
    learner.partial_fit(X, y, classes=tree_stream.target_values)
def test_hoeffding_tree_model_information():
    """Check model measurements and the text description after fitting.

    A HoeffdingTree is fit on 5000 SEA samples, then its reported
    measurements and its model description are compared with fixed
    expected values.
    """
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }
    # Read as an attribute (no call), exactly as the original test does.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        " if Attribute 1 <= 5.440182925299016:\n"
        " Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        " if Attribute 1 > 5.440182925299016:\n"
        " Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        " Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )
    assert expected_description == learner.get_model_description()
def demo():
    """Run ``demo_parameterized`` three times with identical model line-ups.

    Each run receives its own freshly constructed list of learners, so no
    model instance is reused between runs.
    """

    def _build_models():
        # The classifiers we will use
        return [
            HoeffdingTree(),
            SAMKNN(),
            LeverageBagging(random_state=1),
            SGDClassifier(),
        ]

    # All three line-ups are created up front, before any demo is run.
    first_run, second_run, third_run = [_build_models() for _ in range(3)]
    model_names = ['HT', 'SAMKNN', 'LBkNN', 'SGDC']

    # Demo 1 -- plot should not fail
    demo_parameterized(first_run, model_names=model_names)

    # Demo 2 -- csv output should look nice
    demo_parameterized(second_run, "sea_stream.csv", False, model_names)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(third_run, "covtype.csv", False, model_names)
def test_hoeffding_tree_categorical_features(test_path):
    """Fit a tree on the stored categorical test case and check its description.

    Parameters
    ----------
    test_path: str
        Directory that contains 'ht_categorical_features_testcase.npy'.
    """
    data = np.load(
        os.path.join(test_path, 'ht_categorical_features_testcase.npy'))

    # Removes the last two columns (regression targets)
    data = data[:, :-2]
    # Of what remains, the final column is the label, the rest are features.
    X = data[:, :-1]
    y = data[:, -1]

    learner = HoeffdingTree(nominal_attributes=list(range(7)))
    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = (
        "if Attribute 0 = -15.0:\n"
        " Leaf = Class 2 | {2: 350.0}\n"
        "if Attribute 0 = 0.0:\n"
        " Leaf = Class 0 | {0: 420.0, 1: 252.0}\n"
        "if Attribute 0 = 1.0:\n"
        " Leaf = Class 1 | {0: 312.0, 1: 332.0}\n"
        "if Attribute 0 = 2.0:\n"
        " Leaf = Class 1 | {0: 236.0, 1: 383.0}\n"
        "if Attribute 0 = 3.0:\n"
        " Leaf = Class 1 | {0: 168.0, 1: 459.0}\n"
        "if Attribute 0 = -30.0:\n"
        " Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"
    )
    assert learner.get_model_description() == expected_description
def __init__(self, grace_period=200, split_confidence=0.5, leaf_prediction='nba', split_criterion='info_gain'):
    """Create the wrapper and its underlying HoeffdingTree.

    All four arguments are forwarded unchanged, by keyword, to the
    HoeffdingTree constructor; the resulting tree is stored in ``self.clf``.
    """
    super().__init__()
    tree_options = {
        'split_confidence': split_confidence,
        'grace_period': grace_period,
        'leaf_prediction': leaf_prediction,
        'split_criterion': split_criterion,
    }
    self.clf = HoeffdingTree(**tree_options)
def test_hoeffding_tree(test_path):
    """Train a HoeffdingTree on a RandomTreeGenerator stream and check outputs.

    The learner is trained one sample at a time for 5000 samples. Every 100
    samples (except the very first) the prediction and class probabilities
    for the incoming sample are recorded *before* the learner is trained on
    it. Recorded values, ``get_info()`` and ``get_model_description()`` are
    compared with stored expectations.

    Parameters
    ----------
    test_path: str
        Directory that contains 'test_hoeffding_tree.npy' (the expected
        probability predictions).
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6, min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('d')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        0.0, 1.0, 3.0, 0.0, 0.0, 3.0, 0.0, 1.0, 1.0, 2.0,
        0.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 0.0, 1.0, 1.0,
        1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0, 2.0,
        1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0,
        0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [5, 6, 7,' \
                    ' 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    # Two key orderings of the class distribution are accepted.
    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1745.0, 2.0: 978.0, 0.0: 1423.0, 3.0: 854.0}\n'
    assert (learner.get_model_description() == expected_model_1) \
        or (learner.get_model_description() == expected_model_2)
def _choose_classifier(job: Job):
    """Return the classifier that ``job`` asks for.

    For an UPDATE job the stored model is loaded from
    ``job.incremental_train``. Otherwise a new classifier is built from the
    method and config returned by ``get_method_config(job)``.

    Raises
    ------
    ValueError
        If the configured method matches none of the supported methods.
    """
    if job.type == JobTypes.UPDATE.value:
        classifier = _load_model(job.incremental_train)
        # TODO: check if this instruction still makes sense
        # are we updating a predictive_model with its own methods?
        assert classifier[0].__class__.__name__ == job.method
        return classifier

    method, config = get_method_config(job)
    config.pop('classification_method', None)
    print("Using method {} with config {}".format(method, config))

    # Methods whose classifier is built directly from the config.
    plain_factories = {
        ClassificationMethods.KNN.value: KNeighborsClassifier,
        ClassificationMethods.RANDOM_FOREST.value: RandomForestClassifier,
        ClassificationMethods.DECISION_TREE.value: DecisionTreeClassifier,
        ClassificationMethods.XGBOOST.value: XGBClassifier,
        ClassificationMethods.MULTINOMIAL_NAIVE_BAYES.value: MultinomialNB,
        ClassificationMethods.ADAPTIVE_TREE.value: HAT,
        ClassificationMethods.HOEFFDING_TREE.value: HoeffdingTree,
        ClassificationMethods.SGDCLASSIFIER.value: SGDClassifier,
        ClassificationMethods.PERCEPTRON.value: Perceptron,
    }
    if method in plain_factories:
        return plain_factories[method](**config)

    if method == ClassificationMethods.NN.value:
        # The NN classifier needs two extra settings derived from the job.
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(job.labelling.type)
        return NNClassifier(**config)

    raise ValueError("Unexpected classification method {}".format(method))
def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch, update_splitter_counts=False, found_nodes=None):
    """Route an instance through this node, collecting reached nodes.

    Results are appended to ``found_nodes`` in place; nothing is returned.

    Parameters
    ----------
    X, y, weight
        The instance, its class value and its weight.
    parent, parent_branch
        Forwarded unchanged to the selected child.
    update_splitter_counts: bool
        When true, ``weight`` is added to this node's observed class
        distribution under key ``y``.
    found_nodes: list or None
        Accumulator; a new list is created when omitted.
    """
    if found_nodes is None:
        found_nodes = []

    if update_splitter_counts:
        # Dictionary (class_value, weight)
        distribution = self._observed_class_distribution
        if y in distribution:
            distribution[y] += weight
        else:
            distribution[y] = weight

    branch = self.instance_child_index(X)
    if branch >= 0:
        child = self.get_child(branch)
        if child is None:
            # No node on this branch yet: record the empty slot.
            found_nodes.append(HoeffdingTree.FoundNode(None, self, branch))
        else:
            child.filter_instance_to_leaves(X, y, weight, parent,
                                            parent_branch,
                                            update_splitter_counts,
                                            found_nodes)

    alternate = self._alternate_tree
    if alternate is not None:
        # The alternate tree is entered with this node as parent and the
        # fixed branch value -999.
        alternate.filter_instance_to_leaves(X, y, weight, self, -999,
                                            update_splitter_counts,
                                            found_nodes)
def test_evaluate_classification_coverage(tmpdir):
    """Run a prequential evaluation with many metrics enabled at once.

    A simple coverage test. Tests for metrics are placed in the
    corresponding test module; only the current accuracy is asserted here.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=2,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1',
                 'precision', 'recall', 'gmean', 'true_vs_predicted'],
        output_file=os.path.join(str(tmpdir), "prequential_summary.csv"))

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    _, current_performance = evaluator.get_measurements(model_idx=0)

    expected_current_accuracy = 0.685
    assert np.isclose(current_performance.accuracy_score(),
                      expected_current_accuracy)
def _choose_classifier(job: Job):
    """Build a new classifier for the method configured on ``job``.

    The method name and its keyword config come from
    ``get_method_config(job)``; the 'classification_method' entry is removed
    from the config before it is passed to the classifier constructor.

    Raises
    ------
    ValueError
        If the method matches none of the supported classification methods.
    """
    method, config = get_method_config(job)
    config.pop('classification_method', None)
    logger.info("Using method {} with config {}".format(method, config))

    if method == ClassificationMethods.KNN.value:
        return KNeighborsClassifier(**config)
    if method == ClassificationMethods.RANDOM_FOREST.value:
        return RandomForestClassifier(**config)
    if method == ClassificationMethods.DECISION_TREE.value:
        return DecisionTreeClassifier(**config)
    if method == ClassificationMethods.XGBOOST.value:
        return XGBClassifier(**config)
    if method == ClassificationMethods.MULTINOMIAL_NAIVE_BAYES.value:
        return MultinomialNB(**config)
    if method == ClassificationMethods.ADAPTIVE_TREE.value:
        return HAT(**config)
    if method == ClassificationMethods.HOEFFDING_TREE.value:
        return HoeffdingTree(**config)
    if method == ClassificationMethods.SGDCLASSIFIER.value:
        return SGDClassifier(**config)
    if method == ClassificationMethods.PERCEPTRON.value:
        return Perceptron(**config)
    if method == ClassificationMethods.NN.value:
        # The NN classifier needs two extra settings derived from the job.
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(
            job.labelling.type)
        return NNClassifier(**config)

    raise ValueError("Unexpected classification method {}".format(method))
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequential evaluation of a HoeffdingTree, compared with a stored summary.

    Parameters
    ----------
    tmpdir
        Temporary directory in which the summary file is written.
    test_path: str
        Directory that contains the expected 'prequential_summary.csv'.
    """
    # Setup file stream
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['kappa', 'kappa_t', 'performance'],
        output_file=output_file)

    # Evaluate
    evaluated_models = evaluator.evaluate(stream=stream, model=learner)
    result_learner = evaluated_models[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file,
                  os.path.join(test_path, 'prequential_summary.csv'))
def main():
    """Build and start the demo agent network, then return it.

    Returns
    -------
    The AgentNetwork instance, so the caller can shut it down after
    execution.
    """
    # start agent network server
    agentNetwork = AgentNetwork()

    # init agents
    gen_agent = agentNetwork.add_agent(agentType=DataStreamAgent)
    trainer_agent = agentNetwork.add_agent(agentType=Trainer)
    predictor_agent = agentNetwork.add_agent(agentType=Predictor)
    evaluator_agent = agentNetwork.add_agent(agentType=Evaluator)
    monitor_agent_1 = agentNetwork.add_agent(agentType=MonitorAgent)
    monitor_agent_2 = agentNetwork.add_agent(agentType=MonitorAgent)

    gen_agent.init_parameters(stream=SineGenerator(), pretrain_size=1000,
                              batch_size=1)
    trainer_agent.init_parameters(ml_model=HoeffdingTree())

    # connect agents : We can connect multiple agents to any particular agent
    # However the agent needs to implement handling multiple input types
    connections = [
        (gen_agent, trainer_agent),
        (gen_agent, predictor_agent),
        (trainer_agent, predictor_agent),
        (predictor_agent, evaluator_agent),
        (evaluator_agent, monitor_agent_1),
        (predictor_agent, monitor_agent_2),
    ]
    for sender, receiver in connections:
        agentNetwork.bind_agents(sender, receiver)

    # set all agents states to "Running"
    agentNetwork.set_running_state()

    # allow for shutting down the network after execution
    return agentNetwork
class VFDT(IncrementalClassifier):
    """Incremental classifier that delegates to a HoeffdingTree in ``self.clf``."""

    def __init__(self, grace_period=200, split_confidence=0.5, leaf_prediction='nba', split_criterion='info_gain'):
        """Create the underlying tree; every argument is forwarded to it by keyword."""
        super().__init__()
        tree_options = {
            'split_confidence': split_confidence,
            'grace_period': grace_period,
            'leaf_prediction': leaf_prediction,
            'split_criterion': split_criterion,
        }
        self.clf = HoeffdingTree(**tree_options)

    def partial_fit(self, one_row):
        """Train on one row, where ``one_row[0]`` is the sample and ``one_row[1]`` its label."""
        sample, label = one_row[0], one_row[1]
        # The tree is fed single-element batches.
        self.clf.partial_fit([sample], [label])

    def predict(self, x):
        """Return the underlying tree's prediction for ``x``."""
        prediction = self.clf.predict(x)
        return prediction
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequential evaluation of a HoeffdingTree with metric and info checks.

    Beyond comparing the written summary with the stored one, the mean and
    current accuracy / kappa / kappa_t values and the evaluator's
    ``get_info()`` text are checked against fixed expectations.

    Parameters
    ----------
    tmpdir
        Temporary directory in which the summary file is written.
    test_path: str
        Directory that contains the expected 'prequential_summary.csv'.
    """
    # Setup file stream
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['accuracy', 'kappa', 'kappa_t'],
        output_file=output_file)

    # Evaluate
    evaluated_models = evaluator.evaluate(stream=stream, model=learner)
    result_learner = evaluated_models[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file,
                  os.path.join(test_path, 'prequential_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Mean (whole run) metrics
    assert np.isclose(mean_performance.get_accuracy(), 0.436250)
    assert np.isclose(mean_performance.get_kappa(), 0.231791)
    assert np.isclose(mean_performance.get_kappa_t(), 0.236887)

    # Current metrics
    assert np.isclose(current_performance.get_accuracy(), 0.430000)
    assert np.isclose(current_performance.get_kappa(), 0.223909)
    assert np.isclose(current_performance.get_kappa_t(), 0.240000)

    expected_info = (
        "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n"
        " max_samples=1000, max_time=inf,\n"
        " metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n"
        " output_file='prequential_summary.csv',\n"
        " pretrain_size=200, restart_stream=True, show_plot=False)"
    )
    assert evaluator.get_info() == expected_info
def run_indefinetly(input_topic, output_topic, target_index, model=None):
    """Consume CSV samples from Kafka forever, predicting or training per sample.

    Each message is parsed as a header-less CSV sample. Columns before
    ``target_index`` are the features and column ``target_index`` is the
    target. A sample with a missing target is answered with a prediction on
    ``output_topic``; a sample with a target is used to train the model.
    Running accuracy and prediction count are published to
    ``output_topic + '__accuracy'`` and ``output_topic + '__pred_count'``.

    Parameters
    ----------
    input_topic: str
        Topic the samples are read from.
    output_topic: str
        Topic predictions are written to (also the prefix of the metric
        topics).
    target_index: int
        Column position of the target.
    model: optional
        Object exposing ``predict`` and ``partial_fit``. When omitted, a new
        HoeffdingTree is created for this call, so separate calls never
        share a model.
    """
    if model is None:
        # Build the default here, not in the signature: a default-argument
        # instance is created once at definition time and would be shared
        # (and keep its training state) across calls.
        model = HoeffdingTree()

    print(f'Running AutoML for input_topic={input_topic}, output_topic={output_topic}, target_index={target_index} and broker={BOOTSTRAP_SERVERS}.')

    consumer = KafkaConsumer(
        input_topic,
        bootstrap_servers=BOOTSTRAP_SERVERS,
        group_id=None,
        auto_offset_reset='earliest',
        value_deserializer=lambda x: x.decode('utf-8')
    )
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS,
                             value_serializer=lambda x: x.encode('utf-8'))

    i = 0
    total_predictions = 0
    correct_predictions = 0
    accuracy = 0

    for message in consumer:
        sample = pd.read_csv(StringIO(message.value), header=None)
        i += 1

        if any(sample.dtypes == 'object'):
            print('Streamed sample contains text or malformatted data.')
            continue

        X = sample.iloc[:, :target_index]
        y = sample.iloc[:, target_index]

        # Collect metrics. Best effort: a failure here must not stop the
        # stream, but it is reported instead of being silently dropped.
        # NOTE(review): samples without a target are also counted in
        # total_predictions -- confirm this is intended.
        try:
            prediction = model.predict(X)
            total_predictions += 1
            if prediction[0] == y[0]:
                correct_predictions += 1
            accuracy = correct_predictions / total_predictions
            print(f'Accuracy at sample {i}: {accuracy}')
            producer.send(output_topic + '__accuracy', str(accuracy))
            producer.send(output_topic + '__pred_count', str(total_predictions))
            producer.flush()
        except Exception as e:
            print('An exception occurred during metric collection', e)

        if y.isnull().any():
            # Predict
            try:
                y_pred = pd.DataFrame(model.predict(X))
                producer.send(output_topic, y_pred.to_csv(header=False, index=False))
                producer.flush()
            except Exception as e:
                print('An exception occured during prediction', e)
        else:
            # Train
            try:
                model.partial_fit(X, y)
            except Exception as e:
                print('An exception occured during training', e)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    """Holdout evaluation of a HoeffdingTree, checked against stored results.

    Parameters
    ----------
    tmpdir
        Temporary directory in which the summary file is written.
    test_path: str
        Directory that contains the expected 'holdout_summary.csv'.
    """
    # Setup file stream
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(
        n_wait=200,
        max_samples=1000,
        test_size=50,
        metrics=['accuracy', 'kappa', 'kappa_t'],
        output_file=output_file)

    # Evaluate
    evaluated_models = evaluator.evaluate(stream=stream, model=learner)
    result_learner = evaluated_models[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file, os.path.join(test_path, 'holdout_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(
        model_idx=0)

    # Mean (whole run) metrics
    assert np.isclose(mean_performance.get_accuracy(), 0.344000)
    assert np.isclose(mean_performance.get_kappa(), 0.135021)
    assert np.isclose(mean_performance.get_kappa_t(), 0.180000)

    # Current metrics
    assert np.isclose(current_performance.get_accuracy(), 0.360000)
    assert np.isclose(current_performance.get_kappa(), 0.152542)
    assert np.isclose(current_performance.get_kappa_t(), 0.200000)
def __init__(self, estimator=None, weight_mc=10, weight_inv=0.3, max_session_size=20):
    """Initialise the object around an estimator.

    Parameters
    ----------
    estimator: optional
        Model stored in ``self.ht``. When omitted, a new
        ``HoeffdingTree(leaf_prediction='nb')`` is created for this
        instance.
    weight_mc: number (Default: 10)
        Stored as ``self.w_mc``.
    weight_inv: number (Default: 0.3)
        Stored as ``self.w_inv``.
    max_session_size: int (Default: 20)
        Stored as ``self.max_session_size``.
    """
    super().__init__()
    if estimator is None:
        # A default-argument instance is created once, at definition time,
        # and would be shared (training state included) by every object
        # built without an explicit estimator.
        estimator = HoeffdingTree(leaf_prediction='nb')
    self.ht = estimator
    self.w_mc = weight_mc
    self.w_inv = weight_inv
    self.counter = Counter()
    self.max_session_size = max_session_size
    self._rec_tracker = defaultdict(list)
def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch, update_splitter_counts, found_nodes=None):
    """Record this node as the one reached by the instance.

    A ``HoeffdingTree.FoundNode(self, parent, parent_branch)`` is appended
    to ``found_nodes`` in place; nothing is returned. ``X``, ``y``,
    ``weight`` and ``update_splitter_counts`` are accepted but not used
    here.
    """
    reached = HoeffdingTree.FoundNode(self, parent, parent_branch)
    # Without a caller-supplied accumulator the entry goes into a local list.
    collector = [] if found_nodes is None else found_nodes
    collector.append(reached)
def init_parameters(self, mode="prequential", ml_model=None, split_type=None):
    """Store the evaluation settings on the agent.

    Parameters
    ----------
    mode: str (Default: "prequential")
        Stored as ``self.mode``.
    ml_model: optional
        Stored as ``self.ml_model``. When omitted, a new HoeffdingTree is
        created for this call, so agents never share a default model.
    split_type: optional
        Stored as ``self.split_type``. When omitted, a
        ``StratifiedKFold(n_splits=5, shuffle=True, random_state=0)`` is
        used.
    """
    self.mode = mode
    # Build the default here, not in the signature: a default-argument
    # instance is created once and would be shared across all callers.
    self.ml_model = HoeffdingTree() if ml_model is None else ml_model
    self.results = []
    if split_type is not None:
        self.split_type = split_type
    else:
        self.split_type = StratifiedKFold(n_splits=5, shuffle=True,
                                          random_state=0)
def __init__(self, model=None, random_state=None):
    """Initialise internal state around a base model.

    Parameters
    ----------
    model: optional
        Base model stored in ``self._model``. When omitted, a new
        HoeffdingTree is created for this instance.
    random_state: optional
        Kept in ``self._original_random_state``; ``self.random_state``
        starts as None.
    """
    super().__init__()
    if model is None:
        # A default-argument instance is created once, at definition time,
        # and would be shared (training state included) by every object
        # built without an explicit model.
        model = HoeffdingTree()
    self.classes = None
    self._alpha = 0
    self._model = model
    self._majority_cutoff = 1
    self._training_set_X = []
    self._training_set_y = []
    self._batch_num = 1
    self._past_instances = {}
    self._original_random_state = random_state
    self.random_state = None
def __init__(self, classes, model=None, random_state=None):
    """Initialise internal state for the given classes and base model.

    Parameters
    ----------
    classes: sized iterable
        Class labels. ``self._alpha`` is set to ``1 / len(classes)`` and a
        per-class instance counter is started at 0 for each label.
    model: optional
        Base model stored in ``self._model``. When omitted, a new
        HoeffdingTree is created for this instance.
    random_state: optional
        Kept in ``self._original_random_state``; ``self.random_state``
        starts as None.
    """
    super().__init__()
    if model is None:
        # A default-argument instance is created once, at definition time,
        # and would be shared (training state included) by every object
        # built without an explicit model.
        model = HoeffdingTree()
    self._classes = classes
    self._alpha = 1 / len(self._classes)
    self._model = model
    self._majority_cutoff = 1
    self._training_set_X = []
    self._training_set_y = []
    self._num_instance_per_class = {}
    self._batch_num = 1
    self._original_random_state = random_state
    self.random_state = None
    for var in self._classes:
        self._num_instance_per_class[var] = 0
def demo():
    """Prequential evaluation of a HoeffdingTree on the SEA stream CSV file.

    Results are written to 'test_filestream.csv' and plotting is enabled.
    """
    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")
    stream.prepare_for_use()

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    classifier = HoeffdingTree()

    evaluator_options = dict(
        pretrain_size=100,
        output_file='test_filestream.csv',
        max_samples=10000,
        batch_size=1,
        n_wait=1000,
        show_plot=True,
    )
    evaluator = EvaluatePrequential(**evaluator_options)
    evaluator.evaluate(stream=stream, model=classifier)
def test_evaluate_classification_metrics():
    """Check f1, precision, recall and g-mean from a prequential evaluation.

    Both the current and the mean values of each metric are compared with
    fixed expectations; the observed values are printed first.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=2,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = list(range(15, len(stream.feature_names)))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['f1', 'precision', 'recall', 'gmean'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Print the observed values: mean first, then current.
    for performance in (mean_performance, current_performance):
        print(performance.get_g_mean())
        print(performance.get_recall())
        print(performance.get_precision())
        print(performance.get_f1_score())

    # Current metrics
    assert np.isclose(current_performance.get_f1_score(), 0.7096774193548387)
    assert np.isclose(current_performance.get_precision(), 0.6814159292035398)
    assert np.isclose(current_performance.get_recall(), 0.7403846153846154)
    assert np.isclose(current_performance.get_g_mean(), 0.6802502367624613)

    # Mean (whole run) metrics
    assert np.isclose(mean_performance.get_f1_score(), 0.7009803921568628)
    assert np.isclose(mean_performance.get_precision(), 0.7185929648241206)
    assert np.isclose(mean_performance.get_recall(), 0.6842105263157895)
    assert np.isclose(mean_performance.get_g_mean(), 0.6954166367760247)
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout

    Holdout evaluation demo in which the learners are passed as a list,
    which makes it a comparison task. The list currently holds a single
    HoeffdingTree evaluated on a WaveformGenerator stream.

    Parameters
    ----------
    output_file: string, optional
        If passed this parameter indicates the output file name. If left
        blank, no output file will be generated.

    instances: int (Default: 40000)
        The evaluation's maximum number of instances.

    """
    # Setup the stream
    # (alternative: FileStream("../data/datasets/covtype.csv", -1, 1))
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifiers to compare
    # (alternatives tried here: KNNAdwin(n_neighbors=8, max_window_size=2000),
    #  PassiveAggressiveClassifier(), SGDRegressor(), PerceptronMask())
    learners = [HoeffdingTree()]

    # Setup the evaluator
    evaluator = EvaluateHoldout(
        test_size=500,
        dynamic_test_set=True,
        max_samples=instances,
        batch_size=1,
        n_wait=5000,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['kappa'],
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learners)
def test_hoeffding_tree_nb(test_path):
    """Train a HoeffdingTree with leaf_prediction='nb' and check its outputs.

    The learner is trained one sample at a time for 5000 samples. Every 100
    samples (except the very first) the prediction for the incoming sample
    is recorded *before* the learner is trained on it. Recorded predictions
    and ``get_info()`` are compared with fixed expectations.

    Parameters
    ----------
    test_path: str
        Not used in this test.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6, min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx,
                            leaf_prediction='nb')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    # Collected so that predict_proba is exercised; not asserted below.
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2
    ])

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_info = "HoeffdingTree(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    " max_byte_size=33554432, memory_estimate_period=1000000,\n" \
                    " nb_threshold=0, no_preprune=False,\n" \
                    " nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14],\n" \
                    " remove_poor_atts=False, split_confidence=1e-07,\n" \
                    " split_criterion='info_gain', stop_mem_management=False,\n" \
                    " tie_threshold=0.05)"
    assert learner.get_info() == expected_info
def demo():
    """ _test_pipeline

    Wraps a HoeffdingTree in a single-step Pipeline and passes the pipeline
    as the model to an EvaluatePrequential object, with a
    WaveformGenerator as the stream.

    """
    # Setup the stream
    # Other streams tried here: FileStream("../data/datasets/covtype.csv", -1, 1)
    # and RandomTreeGenerator(n_classes=2, n_numerical_attributes=5).
    # If used for Hoeffding Trees then need to pass indices for Nominal attributes
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the pipeline around the classifier
    # (other classifiers tried here: PerceptronMask(), NaiveBayes(),
    #  PassiveAggressiveClassifier())
    steps = [('Hoeffding Tree', HoeffdingTree())]
    pipe = Pipeline(steps)

    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=True, pretrain_size=1000,
                                    max_samples=100000)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
def simulation():
    """ Simulation webpage

    Reads the simulation settings from the request's query arguments,
    starts ``spc_method`` as a background task and renders the page.

    Returns
    -------
    webpage: string.
        Html of the simulation webpage.

    """
    global thread, stream_stop_event, stream_pause_event

    # Reset the shared streaming state; the stop event starts out set.
    thread = Thread()
    stream_stop_event = Event()
    stream_stop_event.set()
    stream_pause_event = Event()

    params = request.args

    dataset = params.get('dataset') + ".data"
    print("DATASET:", dataset)

    # Any model name other than the two listed falls back to HoeffdingTree.
    model_name = params.get('model')
    if model_name == "NaiveBayes":
        model = NaiveBayes()
    elif model_name == "VFDR":
        model = VFDR(ordered_rules=False, rule_prediction="weighted_sum",
                     drift_detector=None)
    else:
        model = HoeffdingTree()

    freq = params.get('freq')
    alpha = params.get('alpha')
    beta = params.get('beta')
    buffer = params.get('buffer') == "on"

    # Upper x-axis bound: row count of the dataset file plus one.
    xmax = pd.read_csv(BASE_DIR + dataset).shape[0] + 1

    thread = socketio.start_background_task(spc_method, dataset, model,
                                            int(alpha), int(beta), buffer,
                                            int(freq))
    plot = create_plot(model_name, xmax)
    return render_template('simulation.html', plot=plot)
def main():
    """Build and start a two-model agent network, then return it.

    One data stream agent feeds two ML model agents (a HoeffdingTree and a
    NaiveBayes model); both report to one monitor agent. The network is
    also stored in the module-level ``agentNetwork``.

    Returns
    -------
    The AgentNetwork instance, so the caller can shut it down after
    execution.
    """
    global agentNetwork

    # start agent network
    agentNetwork = AgentNetwork()

    # add agents
    data_stream_agent_1 = agentNetwork.add_agent(agentType=DataStreamAgent)
    ml_agent_hoeffdingTree = agentNetwork.add_agent(agentType=ML_Model)
    ml_agent_neuralNets = agentNetwork.add_agent(agentType=ML_Model)
    monitor_agent_1 = agentNetwork.add_agent(agentType=MonitorAgent)

    # init parameters
    data_stream_agent_1.init_parameters(stream=WaveformGenerator(),
                                        pretrain_size=1000,
                                        batch_size=100)
    ml_agent_hoeffdingTree.init_parameters(ml_model=HoeffdingTree())
    ml_agent_neuralNets.init_parameters(ml_model=NaiveBayes())

    # connect agents
    connections = (
        (data_stream_agent_1, ml_agent_hoeffdingTree),
        (data_stream_agent_1, ml_agent_neuralNets),
        (ml_agent_hoeffdingTree, monitor_agent_1),
        (ml_agent_neuralNets, monitor_agent_1),
    )
    for sender, receiver in connections:
        agentNetwork.bind_agents(sender, receiver)

    agentNetwork.set_running_state()

    # allow for shutting down the network after execution
    return agentNetwork
from strlearn.evaluators import TestThenTrain from sklearn.naive_bayes import GaussianNB from strlearn.metrics import (balanced_accuracy_score, f1_score, geometric_mean_score_1, precision, recall, specificity) import sys from sklearn.base import clone from sklearn.tree import DecisionTreeClassifier from skmultiflow.trees import HoeffdingTree # Select streams and methods streams = h.realstreams() print(len(streams)) ob = OnlineBagging(n_estimators=20, base_estimator=HoeffdingTree(split_criterion='hellinger')) oob = OOB(n_estimators=20, base_estimator=HoeffdingTree(split_criterion='hellinger')) uob = UOB(n_estimators=20, base_estimator=HoeffdingTree(split_criterion='hellinger')) ros_knorau2 = SEA(base_estimator=StratifiedBagging( base_estimator=HoeffdingTree(split_criterion='hellinger'), random_state=42, oversampler="ROS"), oversampled="ROS", des="KNORAU2") cnn_knorau2 = SEA(base_estimator=StratifiedBagging( base_estimator=HoeffdingTree(split_criterion='hellinger'), random_state=42, oversampler="CNN"), oversampled="CNN",
def init_parameters(self, ml_model=None):
    """Store the model this agent will use in ``self.ml_model``.

    Parameters
    ----------
    ml_model: optional
        The model to store. When omitted, a new HoeffdingTree is created
        for this call, so agents never share a default model.
    """
    # Build the default here, not in the signature: a default-argument
    # instance is created once and would be shared across all agents.
    self.ml_model = HoeffdingTree() if ml_model is None else ml_model