def test_extremely_fast_decision_tree_coverage():
    """Exercise EFDT code paths: memory management and nominal attributes.

    Two independent scenarios are run: a size-bounded tree trained on SEA
    data, and a tree with nominal attributes and 'nba' leaf prediction
    trained on random-tree data.
    """
    # --- Scenario 1: memory management -----------------------------------
    size_limit_kb = 20
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(sea_stream, 5000)

    # Unconstrained model has over 50 kB
    bounded_tree = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='mc',
        memory_estimate_period=200,
        max_byte_size=size_limit_kb * 2**10,
        min_samples_reevaluate=2500,
    )
    bounded_tree.partial_fit(X, y, classes=[0, 1])
    assert calculate_object_size(bounded_tree, 'kB') <= size_limit_kb

    bounded_tree.reset()

    # --- Scenario 2: nominal attribute observer --------------------------
    tree_stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_categories_per_cat_feature=4,
        n_num_features=1,
        max_tree_depth=30,
        min_leaf_depth=10,
        fraction_leaves_per_level=0.45,
    )
    X, y = get_next_n_samples(tree_stream, 5000)
    nominal_tree = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba',
        nominal_attributes=list(range(1, 9)),
    )
    nominal_tree.partial_fit(X, y, classes=[0, 1])
def run_classifier(estimator, stream, pruning=None, ensemble_size=15, m=200):
    """Train and evaluate a ``LearnPPNSEClassifier`` batch by batch.

    The classifier is pre-trained on one batch of ``m`` samples. Then, for
    ten further batches, it predicts on the batch before training on it.

    Parameters
    ----------
    estimator
        Base estimator handed to ``LearnPPNSEClassifier``.
    stream
        Source of samples, consumed through ``get_next_n_samples``.
    pruning
        Forwarded unchanged as the classifier's ``pruning`` argument.
    ensemble_size : int
        Forwarded as ``n_estimators``.
    m : int
        Number of samples drawn per batch.

    Returns
    -------
    tuple
        ``(corrects, acc, classifier)``: the count of correct predictions,
        that count divided by the number of evaluated samples, and the
        trained classifier.
    """
    classifier = LearnPPNSEClassifier(
        base_estimator=estimator,
        window_size=250,
        pruning=pruning,
        slope=0.5,
        crossing_point=10,
        n_estimators=ensemble_size,
    )

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre training the classifier with m samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=[0, 1])

    for _ in range(10):
        X, y = get_next_n_samples(stream, m)
        # Predict first, then learn from the same batch.
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
        sample_count += m

    acc = corrects / sample_count

    assert isinstance(classifier.predict(X), np.ndarray)

    return corrects, acc, classifier
def test_hoeffding_tree_model_information():
    """Check ``model_measurements`` and the textual model of a Hoeffding tree."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(stream, 5000)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=[0, 1])

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    # Parenthesised implicit concatenation: no trailing backslash that could
    # splice the statement following the literal into this expression.
    # NOTE(review): leading whitespace inside these literals is reproduced as
    # found; confirm it matches what get_model_description() emits.
    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        " if Attribute 1 <= 5.440182925299016:\n"
        " Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        " if Attribute 1 > 5.440182925299016:\n"
        " Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        " Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )

    assert expected_description == learner.get_model_description()
def test_hoeffding_tree_coverage():
    """Exercise the Hoeffding tree memory-management path.

    A tree limited to ``size_limit_kb`` is trained on random-tree data and
    its measured size is checked against that limit.
    """
    # Cover memory management
    n_samples = 5000
    size_limit_kb = 50

    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=10,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=15,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    # Unconstrained model has over 72 kB
    tree = HoeffdingTreeClassifier(
        nominal_attributes=list(range(5, stream.n_features)),
        leaf_prediction='mc',
        memory_estimate_period=100,
        max_byte_size=size_limit_kb * 2**10,
    )

    X, y = get_next_n_samples(stream, n_samples)
    tree.partial_fit(X, y)

    assert calculate_object_size(tree, 'kB') <= size_limit_kb

    tree.reset()
def test_learn_pp():
    """Regression test for ``LearnPPClassifier`` on a random-tree stream.

    Pins the first prediction of each batch, the overall accuracy, the
    number of correct predictions and the ``get_info()`` string.
    """
    stream = RandomTreeGenerator(tree_random_state=2212, sample_random_state=2212)

    estimator = DecisionTreeClassifier(random_state=2212)
    classifier = LearnPPClassifier(base_estimator=estimator, n_estimators=5,
                                   n_ensembles=5, random_state=2212)

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre training the classifier with m samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=stream.target_values)
    predictions = []

    for _ in range(10):
        # Use the same batch size that sample_count is incremented by.
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1138
    expected_acc = 0.569
    expected_predictions = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0]

    # np.array_equal compares element-wise and works on every NumPy version;
    # np.alltrue was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions
    assert isinstance(classifier.predict(X), np.ndarray)

    expected_info = (
        "LearnPPClassifier(base_estimator=DecisionTreeClassifier("
        "random_state=2212), error_threshold=0.5, n_ensembles=5, "
        "n_estimators=5, random_state=2212, window_size=100)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(classifier.get_info().split())
    assert info == expected_info

    # For coverage purposes
    classifier.reset()
def test_adaptive_random_forests_nba():
    """Regression test for ``AdaptiveRandomForestClassifier`` probabilities.

    The forest is built with three estimators and otherwise default
    settings (the expected info string reports ``leaf_prediction='nba'``).
    Every 100th sample is scored with ``predict_proba`` before training.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)

    learner = AdaptiveRandomForestClassifier(n_estimators=3, random_state=112)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y, classes=[0, 1])  # labels given

    cnt = 0
    max_samples = 5000
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    assert all(np.isclose(probabilities.sum(), 1) for probabilities in y_proba), \
        "Probabilities should sum to 1."

    y_proba = np.asarray(y_proba).squeeze()
    assert y_proba.shape == (49, 2)

    y_pred = y_proba.argmax(axis=1)
    y_pred_expected = [1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
                       1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
                       1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
                       1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
                       0, 0, 0, 1, 1, 1, 0, 0, 0]

    # Performance below does not need to be guaranteed. This check is set up so
    # that anything that changes to predictions are caught in the unit test.
    # This helps prevent accidental changes.

    assert isinstance(learner.predict(X), np.ndarray)
    # np.array_equal instead of np.alltrue, which was removed in NumPy 2.0.
    assert np.array_equal(y_pred, y_pred_expected)

    expected_info = (
        "AdaptiveRandomForestClassifier(binary_split=False, "
        "disable_weighted_vote=False, drift_detection_method=ADWIN(delta=0.001), "
        "grace_period=50, lambda_value=6, leaf_prediction='nba', "
        "max_byte_size=33554432, max_features=5, memory_estimate_period=2000000, "
        "n_estimators=3, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=None, performance_metric='acc', random_state=112, "
        "remove_poor_atts=False, split_confidence=0.01, "
        "split_criterion='info_gain', stop_mem_management=False, "
        "tie_threshold=0.05, warning_detection_method=ADWIN(delta=0.01))"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_learn_pp_early_stop():
    """Check ``LearnPPClassifier`` on a stream generated with one class."""
    # Corner case where all observations belong to the same class:
    # not all ensemble members need to be trained (PR #223)
    stream = RandomTreeGenerator(
        tree_random_state=7, sample_random_state=8, n_classes=1
    )

    estimator = DecisionTreeClassifier(random_state=42)
    classifier = LearnPPClassifier(
        base_estimator=estimator, n_estimators=5, n_ensembles=5,
        random_state=7
    )

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre training the classifier with m samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=stream.target_values)
    predictions = []

    for _ in range(5):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1000
    expected_acc = 1.0
    expected_predictions = [0, 0, 0, 0, 0]

    # np.array_equal instead of np.alltrue, which was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions
def test_batch_incremental():
    """Regression test for ``BatchIncrementalClassifier`` predictions.

    Every 100th sample is predicted before being learned. The predictions,
    the hit count and the resulting accuracy are pinned.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)

    estimator = DecisionTreeClassifier(random_state=112)
    learner = BatchIncrementalClassifier(base_estimator=estimator, n_estimators=10)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0,
    ]
    expected_correct_predictions = 31
    expected_performance = 0.6326530612244898

    # np.array_equal instead of np.alltrue, which was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)

    expected_info = (
        "BatchIncrementalClassifier(base_estimator=DecisionTreeClassifier("
        "random_state=112), n_estimators=10, window_size=100)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_multi_output_learner_regressor():
    """Regression test for ``MultiOutputLearner`` wrapping an ``SGDRegressor``.

    Every 100th sample is predicted before being learned. The mean absolute
    error over those predictions is pinned, and ``predict_proba`` is
    expected to raise ``AttributeError``.
    """
    stream = RegressionGenerator(n_samples=5500, n_features=10, n_informative=20,
                                 n_targets=2, random_state=1)

    # NOTE(review): 'squared_loss' was renamed 'squared_error' in newer
    # scikit-learn releases; confirm the supported version range.
    estimator = SGDRegressor(random_state=112, tol=1e-3, max_iter=10, loss='squared_loss')
    learner = MultiOutputLearner(base_estimator=estimator)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_targets = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            true_targets.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    expected_performance = 2.444365309339395
    performance = mean_absolute_error(true_targets, predictions)
    assert np.isclose(performance, expected_performance)

    assert learner._estimator_type == "regressor"
    assert isinstance(learner.predict(X), np.ndarray)

    with pytest.raises(AttributeError):
        learner.predict_proba(X)
def test_multi_output_learner_classifier():
    """Regression test for ``MultiOutputLearner`` wrapping an ``SGDClassifier``.

    Every 100th sample is predicted before being learned. The expected
    values depend on the installed scikit-learn version, so each version
    branch only selects expectations; the assertions themselves are shared
    so that neither branch can skip a check.
    """
    stream = MultilabelGenerator(n_samples=5150, n_features=15, n_targets=3,
                                 n_labels=4, random_state=112)

    estimator = SGDClassifier(random_state=112, max_iter=10, loss='log')
    classifier = MultiOutputLearner(base_estimator=estimator)

    X, y = get_next_n_samples(stream, 150)
    classifier.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(classifier.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        classifier.partial_fit(X, y)
        cnt += 1

    if LooseVersion(sklearn_version) < LooseVersion("0.21"):
        expected_predictions = [
            [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 0.0],
            [0.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0],
            [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 26
        expected_performance = 0.7755102040816326
        expected_info = (
            "MultiOutputLearner(base_estimator=SGDClassifier(loss='log', "
            "random_state=112))"
        )
    else:
        expected_predictions = [
            [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 0.0],
            [1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 23
        expected_performance = 0.7482993197278911
        expected_info = (
            "MultiOutputLearner(base_estimator=SGDClassifier(loss='log', "
            "max_iter=10, random_state=112))"
        )

    # Shared checks. The prediction comparison is now asserted in both
    # branches; previously the newer-sklearn branch evaluated it and threw
    # the result away.
    assert np.array_equal(predictions, expected_predictions)
    assert correct_predictions == expected_correct_predictions

    performance = hamming_score(true_labels, predictions)
    assert np.isclose(performance, expected_performance)

    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(classifier.get_info().split())
    assert info == expected_info

    assert isinstance(classifier.predict(X), np.ndarray)
    assert isinstance(classifier.predict_proba(X), np.ndarray)
def test_classifier_chains():
    """Regression test for ``ClassifierChain`` wrapping an ``SGDClassifier``.

    Every 100th sample is predicted before being learned. The predictions,
    the number of exact matches and the ``get_info()`` string are pinned.
    The expectations do not depend on the scikit-learn version, so a
    single set of assertions is used.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)

    estimator = SGDClassifier(random_state=seed, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
        [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0],
        [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 1.0], [1.0, 0.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0],
        [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0],
        [0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0],
        [1.0, 1.0, 1.0],
    ]
    # np.array_equal instead of np.alltrue, which was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)

    expected_correct_predictions = 26
    assert correct_predictions == expected_correct_predictions

    expected_info = (
        "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, "
        "random_state=112), order=None, random_state=112)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
def test_regressor_chains():
    """Regression test for ``RegressorChain`` wrapping an ``SGDRegressor``.

    Every 100th sample is predicted before being learned and the predicted
    values are compared element-wise against pinned values.

    NOTE(review): the previous check compared ``.all()`` of both arrays,
    i.e. two booleans, so it could not fail on wrong values. The pinned
    values span many orders of magnitude; if the element-wise comparison
    proves platform-sensitive, regenerate or loosen the tolerances rather
    than reverting to the vacuous form.
    """
    X_reg, y_reg = make_regression(random_state=112, n_targets=3, n_samples=5150)

    # One [X, y] pair per sample. Named ``samples`` so it does not shadow
    # the ``array`` type that other tests in this module call.
    samples = [
        [x_row.reshape(1, 100), y_row.reshape(1, 3)]
        for x_row, y_row in zip(X_reg, y_reg)
    ]

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    stream = FromArrayGenerator(samples, False)
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
        [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
        [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
        [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
        [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
        [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
        [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
        [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
        [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
        [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
        [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
        [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
        [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
        [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
        [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
        [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
        [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
        [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
        [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
        [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
        [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
        [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
        [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
        [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
        [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
        [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
        [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
        [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
        [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
        [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
        [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
        [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
        [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
        [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
        [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
        [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
        [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
        [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
        [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
        [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
        [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
        [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
        [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
        [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
        [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
        [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
        [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
        [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
        [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23],
    ]

    # Element-wise comparison of the actual values.
    assert np.allclose(predictions, expected_predictions)
    assert isinstance(learner.predict(X), np.ndarray)

    expected_info = (
        "RegressorChain(base_estimator=SGDRegressor(max_iter=10, random_state=112), "
        "order=None, random_state=112)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def test_hoeffding_adaptive_tree_mc(test_path):
    """Regression test for ``HoeffdingAdaptiveTreeClassifier`` with 'mc' leaves.

    Parameters
    ----------
    test_path
        Directory containing ``test_hoeffding_adaptive_tree_mc.npy``, the
        stored class probabilities the observed ones are compared against.
    """

    def _make_drift_stream():
        # Both halves of the test start from an identical, fresh stream.
        return ConceptDriftStreamGenerator(
            stream=SEAGenerator(random_state=1, noise_percentage=0.05),
            drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                      noise_percentage=0.05),
            random_state=1, position=250, width=10)

    stream = _make_drift_stream()

    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='mc', random_state=1)

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    # One prediction per tested sample (cnt = 20, 40, ..., 980), all class 1.
    expected_predictions = array('i', [1] * 49)
    # array objects compare element-wise with ==, so no NumPy reduction
    # (np.alltrue was removed in NumPy 2.0) is needed.
    assert y_pred == expected_predictions

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = (
        "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, "
        "leaf_prediction='mc', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, "
        "no_preprune=False, nominal_attributes=None, random_state=1, remove_poor_atts=False, "
        "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1 | {0: 398.0, 1: 1000.0}\n'
    assert learner.get_model_description() == expected_model_1

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Second pass: train a tree with a 30-byte memory budget on a fresh
    # stream; only successful completion is checked.
    stream = _make_drift_stream()
    X, y = get_next_n_samples(stream, 5000)
    learner = HoeffdingAdaptiveTreeClassifier(max_byte_size=30, leaf_prediction='mc',
                                              grace_period=10)
    learner.partial_fit(X, y)
def test_learn_nse():
    """Regression test for ``LearnPPNSEClassifier``.

    Covers the default configuration, ``reset()``, both pruning strategies
    ("error" and "age") through ``run_classifier``, and finally a run with
    a ``HoeffdingTreeClassifier`` base estimator.
    """
    stream = SEAGenerator(random_state=2212)
    estimator = GaussianNB()

    corrects, acc, classifier = run_classifier(estimator, stream)

    expected_correct_predictions = 1754
    expected_acc = 0.877

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    # Test reset method
    classifier.reset()
    assert len(classifier.ensemble) == 0
    assert len(classifier.ensemble_weights) == 0
    assert len(classifier.bkts) == 0
    assert len(classifier.wkts) == 0
    assert len(classifier.X_batch) == 0
    assert len(classifier.y_batch) == 0

    expected_info = (
        'LearnPPNSEClassifier(base_estimator=GaussianNB(), crossing_point=10, '
        'n_estimators=15, pruning=None, slope=0.5, window_size=250)'
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(classifier.get_info().split())
    assert info == expected_info

    # test pruning error (the same stream object keeps being consumed)
    corrects, acc, classifier = run_classifier(estimator, stream, pruning="error",
                                               ensemble_size=5)

    expected_correct_predictions = 1751
    expected_acc = 0.8755

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    # test pruning age
    corrects, acc, classifier = run_classifier(estimator, stream, pruning="age",
                                               ensemble_size=5)

    expected_correct_predictions = 1774
    expected_acc = 0.887

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    stream = SEAGenerator(random_state=2212)

    estimator = HoeffdingTreeClassifier()
    classifier = LearnPPNSEClassifier(base_estimator=estimator)

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    m = 250
    # Pre training the classifier
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=[0, 1])

    for _ in range(10):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
        sample_count += m

    acc = corrects / sample_count
    expected_acc = 0.9436

    # Tolerant comparison, matching the other accuracy checks in this test;
    # acc is the result of a floating-point division.
    assert np.isclose(expected_acc, acc)
def test_learn_nse_different_proba_sizes():
    """Check ``LearnPPNSEClassifier`` when batches expose different classes.

    Four classes are declared up front while the generated stream has two;
    labels are remapped between batches so successive batches contain
    different label sets. The scenario is run with a scikit-learn decision
    tree and then with a ``HoeffdingTreeClassifier``.
    """
    m = 250
    stream = RandomTreeGenerator(tree_random_state=7, sample_random_state=8, n_classes=2)

    dt = DecisionTreeClassifier(random_state=7)
    classifier = LearnPPNSEClassifier(base_estimator=dt, window_size=250)

    # Pre training the classifier with 250 samples
    X, y = get_next_n_samples(stream, m)

    # Set manually classes
    classifier.partial_fit(X, y, classes=np.array([0, 1, 2, 3]))

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3
    y[y == 1] = 2
    classifier.partial_fit(X, y)

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3

    pred = classifier.predict(X)
    classifier.partial_fit(X, y)

    # Fail explicitly on a missing prediction instead of hitting an
    # undefined ``corrects`` below.
    assert pred is not None
    corrects = np.sum(y == pred)

    expected_correct_predictions = 115
    assert corrects == expected_correct_predictions

    stream = RandomTreeGenerator(tree_random_state=7, sample_random_state=8, n_classes=2)

    # Repeating process with a skmultiflow-based learner
    ht = HoeffdingTreeClassifier(leaf_prediction='mc')
    classifier = LearnPPNSEClassifier(base_estimator=ht, window_size=250)

    # Pre training the classifier with 250 samples
    X, y = get_next_n_samples(stream, m)

    # Forcing exception to increase coverage
    with pytest.raises(RuntimeError):
        classifier.partial_fit(X, y, classes=None)

    classifier.reset()
    # Set manually classes
    classifier.partial_fit(X, y, classes=np.array([0, 1, 2, 3]))

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3
    y[y == 1] = 2
    classifier.partial_fit(X, y)

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3

    pred = classifier.predict(X)

    # Without this guard a None prediction would silently reuse the
    # ``corrects`` value computed in the first half of the test.
    assert pred is not None
    corrects = np.sum(y == pred)

    expected_correct_predictions = 109
    assert corrects == expected_correct_predictions
def test_hoeffding_tree_nba(test_path):
    """Regression test for ``HoeffdingTreeClassifier`` with default settings.

    The expected info string reports ``leaf_prediction='nba'``. After the
    prediction checks, the split criterion is switched to 'hellinger', the
    tree is trained further and its rule description is pinned.

    Parameters
    ----------
    test_path
        Directory containing ``test_hoeffding_tree.npy``, the stored class
        probabilities the observed ones are compared against.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
        n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6,
        min_leaf_depth=3, fraction_leaves_per_level=0.15
    )

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2,
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # array objects compare element-wise with ==, so no NumPy reduction
    # (np.alltrue was removed in NumPy 2.0) is needed.
    assert predictions == expected_predictions
    assert np.allclose(proba_predictions, data)

    expected_info = (
        "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nba', "
        "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, "
        "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, "
        "tie_threshold=0.05)"
    )
    # Collapse all whitespace runs (including newlines) to single spaces.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1 | {0: 1423.0, 1: 1745.0, 2: 978.0, 3: 854.0}\n'
    assert learner.get_model_description() == expected_model_1

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Continue training with a different split criterion and pin the rules.
    X, y = get_next_n_samples(stream, 20000)
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = (
        'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n'
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000'
        ' | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000'
        ' | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    )
    assert expected_rules == learner.get_rules_description()