def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    max_size_kb = 20
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(stream, 5000)

    # The unconstrained model is over 50 kB in size
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='mc',
        memory_estimate_period=200,
        max_byte_size=max_size_kb * 2**10,
        min_samples_reevaluate=2500)

    learner.partial_fit(X, y, classes=[0, 1])
    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1,
                                 max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    X, y = get_next_n_samples(stream, 5000)
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=list(range(1, 9)))
    learner.partial_fit(X, y, classes=[0, 1])
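# NOTE: `get_next_n_samples` is a shared test helper that is not defined in
# this snippet. A minimal sketch of its assumed behavior, under a
# hypothetical name to avoid shadowing the real helper: draw n samples from
# a stream, one at a time, and stack them into single (X, y) arrays.
def _get_next_n_samples_sketch(stream, n):
    X_batch, y_batch = [], []
    for _ in range(n):
        X, y = stream.next_sample()  # single-sample batch
        X_batch.append(X[0])
        y_batch.append(y[0])
    return np.asarray(X_batch), np.asarray(y_batch)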
def run_classifier(estimator, stream, pruning=None, ensemble_size=15, m=200):
    classifier = LearnPPNSEClassifier(base_estimator=estimator,
                                      window_size=250,
                                      pruning=pruning,
                                      slope=0.5,
                                      crossing_point=10,
                                      n_estimators=ensemble_size)

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre-train the classifier with the first m samples (200 by default)
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=[0, 1])

    for i in range(10):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
        sample_count += m

    acc = corrects / sample_count

    assert type(classifier.predict(X)) == np.ndarray

    return corrects, acc, classifier
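# Illustrative call only (the real invocations appear in test_learn_nse
# below); any classifier supporting partial_fit should work here:
#
#     corrects, acc, clf = run_classifier(GaussianNB(),
#                                         SEAGenerator(random_state=2212))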
def test_hoeffding_tree_model_information():
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(stream, 5000)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=[0, 1])

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.model_measurements
    for k in expected_info:
        assert k in observed_info
        assert expected_info[k] == observed_info[k]

    expected_description = "if Attribute 0 <= 4.549969620513424:\n" \
                            "  if Attribute 1 <= 5.440182925299016:\n" \
                            "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n" \
                            "  if Attribute 1 > 5.440182925299016:\n" \
                            "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n" \
                            "if Attribute 0 > 4.549969620513424:\n" \
                            "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n" \

    assert expected_description == learner.get_model_description()
def test_hoeffding_tree_coverage():
    # Cover memory management
    max_samples = 5000
    max_size_kb = 50
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=10,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=15,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    nominal_attr_idx = list(range(5, stream.n_features))
    # The unconstrained model is over 72 kB in size
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='mc',
                                      memory_estimate_period=100,
                                      max_byte_size=max_size_kb * 2**10)

    X, y = get_next_n_samples(stream, max_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()
def test_learn_pp():
    stream = RandomTreeGenerator(tree_random_state=2212, sample_random_state=2212)

    estimator = DecisionTreeClassifier(random_state=2212)
    classifier = LearnPPClassifier(base_estimator=estimator, n_estimators=5, n_ensembles=5,
                                   random_state=2212)

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre-train the classifier with the first 200 samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=stream.target_values)
    predictions = []

    for i in range(10):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1138
    expected_acc = 0.569
    expected_predictions = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0]

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions
    assert type(classifier.predict(X)) == np.ndarray

    expected_info = "LearnPPClassifier(base_estimator=DecisionTreeClassifier(" \
                    "random_state=2212), error_threshold=0.5, n_ensembles=5, " \
                    "n_estimators=5, random_state=2212, window_size=100)"
    info = " ".join([line.strip() for line in classifier.get_info().split()])
    assert info == expected_info

    # For coverage purposes
    classifier.reset()
def test_adaptive_random_forests_nba():
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)

    learner = AdaptiveRandomForestClassifier(n_estimators=3,
                                             random_state=112)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y, classes=[0, 1])  # labels given

    cnt = 0
    max_samples = 5000
    y_proba = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_proba.append(learner.predict_proba(X)[0])
            true_labels.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    assert np.alltrue([np.isclose(probabilities.sum(), 1) for probabilities in y_proba]), \
        "Probabilities should sum to 1."

    y_proba = np.asarray(y_proba).squeeze()
    assert y_proba.shape == (49, 2)

    y_pred = y_proba.argmax(axis=1)
    y_pred_expected = [1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
                       1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
                       1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
                       1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
                       0, 0, 0, 1, 1, 1, 0, 0, 0]

    # The performance below does not need to be guaranteed. This check is set
    # up so that any change to the predictions is caught by the unit test.
    # This helps prevent accidental changes.

    assert type(learner.predict(X)) == np.ndarray
    assert np.alltrue(y_pred == y_pred_expected)

    expected_info = "AdaptiveRandomForestClassifier(binary_split=False, " \
                    "disable_weighted_vote=False, drift_detection_method=ADWIN(delta=0.001), " \
                    "grace_period=50, lambda_value=6, leaf_prediction='nba', " \
                    "max_byte_size=33554432, max_features=5, memory_estimate_period=2000000, " \
                    "n_estimators=3, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=None, performance_metric='acc', random_state=112, " \
                    "remove_poor_atts=False, split_confidence=0.01, " \
                    "split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05, warning_detection_method=ADWIN(delta=0.01))"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def test_learn_pp_early_stop():
    # Corner case where all observations belong to the same class:
    # not all ensemble members need to be trained (PR #223)
    stream = RandomTreeGenerator(
        tree_random_state=7, sample_random_state=8, n_classes=1
    )

    estimator = DecisionTreeClassifier(random_state=42)
    classifier = LearnPPClassifier(
        base_estimator=estimator, n_estimators=5, n_ensembles=5,
        random_state=7
    )

    m = 200

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    # Pre-train the classifier with the first 200 samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=stream.target_values)
    predictions = []

    for i in range(5):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
            predictions.append(pred[0])
        sample_count += m

    acc = corrects / sample_count

    expected_correct_predictions = 1000
    expected_acc = 1.0
    expected_predictions = [0, 0, 0, 0, 0]

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions
def test_batch_incremental():
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112)

    estimator = DecisionTreeClassifier(random_state=112)
    learner = BatchIncrementalClassifier(base_estimator=estimator,
                                         n_estimators=10)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [
        1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0,
        1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0,
        1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0
    ]

    expected_correct_predictions = 31
    expected_performance = 0.6326530612244898

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = "BatchIncrementalClassifier(base_estimator=DecisionTreeClassifier("\
                    "random_state=112), n_estimators=10, window_size=100)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def test_multi_output_learner_regressor():

    stream = RegressionGenerator(n_samples=5500,
                                 n_features=10,
                                 n_informative=20,
                                 n_targets=2,
                                 random_state=1)

    estimator = SGDRegressor(random_state=112,
                             tol=1e-3,
                             max_iter=10,
                             loss='squared_loss')
    learner = MultiOutputLearner(base_estimator=estimator)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_targets = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            true_targets.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    expected_performance = 2.444365309339395
    performance = mean_absolute_error(true_targets, predictions)
    assert np.isclose(performance, expected_performance)

    assert learner._estimator_type == "regressor"
    assert type(learner.predict(X)) == np.ndarray

    with pytest.raises(AttributeError):
        learner.predict_proba(X)
def test_multi_output_learner_classifier():

    stream = MultilabelGenerator(n_samples=5150,
                                 n_features=15,
                                 n_targets=3,
                                 n_labels=4,
                                 random_state=112)

    estimator = SGDClassifier(random_state=112, max_iter=10, loss='log')
    classifier = MultiOutputLearner(base_estimator=estimator)

    X, y = get_next_n_samples(stream, 150)
    classifier.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(classifier.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        classifier.partial_fit(X, y)
        cnt += 1

    if LooseVersion(sklearn_version) < LooseVersion("0.21"):
        expected_predictions = [[1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 0.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 0.0, 0.0],
                                [0.0, 1.0, 1.0], [1.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [0.0, 1.0, 0.0],
                                [0.0, 0.0, 1.0], [1.0, 0.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 0.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 1.0, 0.0],
                                [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0]]
        assert np.alltrue(np.array_equal(predictions, expected_predictions))

        expected_correct_predictions = 26
        assert correct_predictions == expected_correct_predictions

        expected_performance = 0.7755102040816326
        performance = hamming_score(true_labels, predictions)
        assert np.isclose(performance, expected_performance)

        expected_info = "MultiOutputLearner(base_estimator=SGDClassifier(loss='log', " \
                        "random_state=112))"
        info = " ".join(
            [line.strip() for line in classifier.get_info().split()])
        assert info == expected_info

    else:
        expected_predictions = [[1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 1.0, 0.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 0.0],
                                [1.0, 1.0, 1.0], [1.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 0.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 0.0], [1.0, 0.0, 1.0],
                                [0.0, 1.0, 1.0], [1.0, 1.0, 0.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 1.0, 0.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 1.0, 1.0],
                                [1.0, 0.0, 0.0], [1.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0]]
        assert np.alltrue(np.array_equal(predictions, expected_predictions))

        expected_correct_predictions = 23
        assert correct_predictions == expected_correct_predictions

        expected_performance = 0.7482993197278911
        performance = hamming_score(true_labels, predictions)
        assert np.isclose(performance, expected_performance)

        expected_info = "MultiOutputLearner(base_estimator=SGDClassifier(loss='log', " \
                        "max_iter=10, random_state=112))"

        info = " ".join(
            [line.strip() for line in classifier.get_info().split()])
        assert info == expected_info

    assert type(classifier.predict(X)) == np.ndarray
    assert type(classifier.predict_proba(X)) == np.ndarray
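# For reference, `hamming_score` above comes from skmultiflow.metrics. A
# minimal sketch consistent with the values asserted in this test (the
# fraction of correctly predicted labels over all samples and targets),
# assuming dense 0/1 label arrays:
def _hamming_score_sketch(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(y_true == y_pred)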
def test_classifier_chains():
    seed = 112
    stream = MultilabelGenerator(random_state=seed,
                                 n_targets=3,
                                 n_samples=5150)

    estimator = SGDClassifier(random_state=seed, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    if not sklearn_version.startswith("0.21"):
        expected_predictions = [[0., 0., 1.], [0., 0., 0.], [1., 0., 1.],
                                [1., 0., 1.], [0., 0., 1.], [1., 0., 0.],
                                [1., 0., 1.], [1., 0., 1.], [0., 0., 1.],
                                [0., 0., 0.], [1., 0., 1.], [0., 0., 1.],
                                [0., 0., 1.], [0., 0., 1.], [0., 0., 1.],
                                [0., 0., 1.], [1., 0., 1.], [0., 0., 0.],
                                [1., 0., 1.], [0., 0., 0.], [0., 1., 1.],
                                [0., 1., 1.], [0., 0., 1.], [0., 1., 1.],
                                [0., 1., 1.], [0., 1., 1.], [0., 1., 0.],
                                [0., 1., 0.], [1., 1., 1.], [0., 1., 0.],
                                [0., 1., 1.], [1., 0., 1.], [0., 1., 1.],
                                [0., 0., 0.], [0., 0., 0.], [1., 0., 0.],
                                [1., 1., 1.], [0., 1., 1.], [0., 0., 0.],
                                [1., 0., 1.], [0., 0., 1.], [0., 0., 0.],
                                [0., 0., 0.], [0., 0., 1.], [0., 1., 0.],
                                [0., 0., 0.], [1., 1., 1.], [0., 0., 0.],
                                [1., 1., 1.]]
        assert np.alltrue(np.array_equal(predictions, expected_predictions))

        expected_correct_predictions = 26
        assert correct_predictions == expected_correct_predictions

        expected_info = "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, " \
                        "random_state=112), order=None, random_state=112)"
        info = " ".join([line.strip() for line in learner.get_info().split()])
        assert info == expected_info

    else:
        expected_predictions = [[0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [0.0, 1.0, 0.0], [0.0, 1.0, 0.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 0.0],
                                [0.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0], [1.0, 0.0, 0.0],
                                [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [0.0, 0.0, 0.0], [0.0, 0.0, 1.0],
                                [0.0, 1.0, 0.0], [0.0, 0.0, 0.0],
                                [1.0, 1.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 1.0, 1.0]]
        assert np.alltrue(np.array_equal(predictions, expected_predictions))

        expected_correct_predictions = 26
        assert correct_predictions == expected_correct_predictions

        expected_info = "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, " \
                        "random_state=112), order=None, random_state=112)"
        info = " ".join([line.strip() for line in learner.get_info().split()])
        assert info == expected_info

    assert type(learner.predict(X)) == np.ndarray
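# Mechanically, a ClassifierChain trains one base model per target, feeding
# model k the original features augmented with targets 0..k-1. A minimal
# sketch of that training step (hypothetical helper, not part of the tested
# API), assuming binary 0/1 targets and partial_fit support:
def _chain_partial_fit_sketch(models, X, Y):
    for k, model in enumerate(models):
        X_aug = np.hstack([X, Y[:, :k]])  # features + previous true targets
        model.partial_fit(X_aug, Y[:, k], classes=[0, 1])
    # At prediction time the chain fills Y column by column, feeding each
    # model's output to the next one.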
def test_regressor_chains():
    X_reg, y_reg = make_regression(random_state=112, n_targets=3, n_samples=5150)
    samples = []
    for i in range(X_reg.shape[0]):
        samples.append([X_reg[i].reshape(1, 100), y_reg[i].reshape(1, 3)])

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    stream = FromArrayGenerator(samples, False)

    X, y = get_next_n_samples(stream, 150)

    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
            true_labels.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [[-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
                            [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
                            [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
                            [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
                            [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
                            [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
                            [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
                            [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
                            [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
                            [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
                            [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
                            [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
                            [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
                            [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
                            [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
                            [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
                            [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
                            [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
                            [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
                            [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
                            [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
                            [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
                            [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
                            [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
                            [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
                            [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
                            [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
                            [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
                            [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
                            [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
                            [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
                            [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
                            [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
                            [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
                            [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
                            [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
                            [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
                            [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
                            [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
                            [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
                            [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
                            [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
                            [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
                            [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
                            [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
                            [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
                            [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
                            [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
                            [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23]]

    assert np.allclose(predictions, expected_predictions)
    assert type(learner.predict(X)) == np.ndarray

    expected_info = "RegressorChain(base_estimator=SGDRegressor(max_iter=10, random_state=112), " \
                    "order=None, random_state=112)"

    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
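# The extreme magnitudes in `expected_predictions` above (up to ~1e25) are
# plausible rather than a bug: in a RegressorChain each model consumes the
# previous targets as extra inputs, so an unscaled SGDRegressor can amplify
# its errors down the chain. The assertion only pins down reproducibility.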
def test_hoeffding_adaptive_tree_mc(test_path):
    stream = ConceptDriftStreamGenerator(
        stream=SEAGenerator(random_state=1, noise_percentage=0.05),
        drift_stream=SEAGenerator(random_state=2,
                                  classification_function=2,
                                  noise_percentage=0.05),
        random_state=1,
        position=250,
        width=10)

    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='mc',
                                              random_state=1)

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1
    ])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, " \
                    "leaf_prediction='mc', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, random_state=1, remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])

    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1 | {0: 398.0, 1: 1000.0}\n'

    assert (learner.get_model_description() == expected_model_1)

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    stream = ConceptDriftStreamGenerator(
        stream=SEAGenerator(random_state=1, noise_percentage=0.05),
        drift_stream=SEAGenerator(random_state=2,
                                  classification_function=2,
                                  noise_percentage=0.05),
        random_state=1,
        position=250,
        width=10)
    X, y = get_next_n_samples(stream, 5000)

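    # Corner case: a byte-size budget (30 B) far smaller than a single node,
    # which should presumably force immediate leaf deactivation through the
    # memory-management code path.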
    learner = HoeffdingAdaptiveTreeClassifier(max_byte_size=30,
                                              leaf_prediction='mc',
                                              grace_period=10)
    learner.partial_fit(X, y)
def test_learn_nse():
    stream = SEAGenerator(random_state=2212)

    estimator = GaussianNB()

    corrects, acc, classifier = run_classifier(estimator, stream)

    expected_correct_predictions = 1754
    expected_acc = 0.877

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    # Test reset method
    classifier.reset()
    assert len(classifier.ensemble) == 0
    assert len(classifier.ensemble_weights) == 0
    assert len(classifier.bkts) == 0
    assert len(classifier.wkts) == 0
    assert len(classifier.X_batch) == 0
    assert len(classifier.y_batch) == 0

    expected_info = 'LearnPPNSEClassifier(base_estimator=GaussianNB(), crossing_point=10, ' \
                    'n_estimators=15, pruning=None, slope=0.5, window_size=250)'
    info = " ".join([line.strip() for line in classifier.get_info().split()])
    assert info == expected_info
    # test pruning error
    corrects, acc, classifier = run_classifier(estimator,
                                               stream,
                                               pruning="error",
                                               ensemble_size=5)

    expected_correct_predictions = 1751
    expected_acc = 0.8755

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    # test pruning age
    corrects, acc, classifier = run_classifier(estimator,
                                               stream,
                                               pruning="age",
                                               ensemble_size=5)

    expected_correct_predictions = 1774
    expected_acc = 0.887

    assert np.isclose(expected_acc, acc)
    assert corrects == expected_correct_predictions

    stream = SEAGenerator(random_state=2212)

    estimator = HoeffdingTreeClassifier()

    classifier = LearnPPNSEClassifier(base_estimator=estimator)

    # Keeping track of sample count and correct prediction count
    sample_count = 0
    corrects = 0

    m = 250
    # Pre-train the classifier with the first m samples
    X, y = get_next_n_samples(stream, m)
    classifier.partial_fit(X, y, classes=[0, 1])

    for i in range(10):
        X, y = get_next_n_samples(stream, m)
        pred = classifier.predict(X)
        classifier.partial_fit(X, y)

        if pred is not None:
            corrects += np.sum(y == pred)
        sample_count += m

    acc = corrects / sample_count
    expected_acc = 0.9436
    assert np.isclose(acc, expected_acc)
def test_learn_nse_different_proba_sizes():
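    # The labels are remapped below (0 -> 3, 1 -> 2) so that successive
    # batches expose different subsets of the declared classes, exercising
    # the code that aligns probability vectors of different sizes.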
    m = 250
    stream = RandomTreeGenerator(tree_random_state=7,
                                 sample_random_state=8,
                                 n_classes=2)

    dt = DecisionTreeClassifier(random_state=7)
    classifier = LearnPPNSEClassifier(base_estimator=dt, window_size=250)

    # Pre-train the classifier with the first 250 samples
    X, y = get_next_n_samples(stream, m)

    # Manually set the classes
    classifier.partial_fit(X, y, classes=np.array([0, 1, 2, 3]))

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3
    y[y == 1] = 2

    # No prediction at this step; just update the ensemble
    classifier.partial_fit(X, y)

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3

    pred = classifier.predict(X)
    classifier.partial_fit(X, y)

    if pred is not None:
        corrects = np.sum(y == pred)

    expected_correct_predictions = 115
    assert corrects == expected_correct_predictions

    stream = RandomTreeGenerator(tree_random_state=7,
                                 sample_random_state=8,
                                 n_classes=2)
    # Repeat the process with a skmultiflow-based learner
    ht = HoeffdingTreeClassifier(leaf_prediction='mc')
    classifier = LearnPPNSEClassifier(base_estimator=ht, window_size=250)

    # Pre-train the classifier with the first 250 samples
    X, y = get_next_n_samples(stream, m)

    # Forcing exception to increase coverage
    with pytest.raises(RuntimeError):
        classifier.partial_fit(X, y, classes=None)

    classifier.reset()
    # Manually set the classes
    classifier.partial_fit(X, y, classes=np.array([0, 1, 2, 3]))

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3
    y[y == 1] = 2

    # No prediction at this step; just update the ensemble
    classifier.partial_fit(X, y)

    X, y = get_next_n_samples(stream, m)
    y[y == 0] = 3

    pred = classifier.predict(X)

    if pred is not None:
        corrects = np.sum(y == pred)

    expected_correct_predictions = 109
    assert corrects == expected_correct_predictions
def test_hoeffding_tree_nba(test_path):
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 3, 0, 1, 1, 1, 1, 0, 3,
        1, 2, 1, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 1, 3,
        2
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')

    data = np.load(test_file)

    assert np.alltrue(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nba', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1 | {0: 1423.0, 1: 1745.0, 2: 978.0, 3: 854.0}\n'

    assert (learner.get_model_description() == expected_model_1)
    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    X, y = get_next_n_samples(stream, 20000)
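    # Switch the split criterion mid-stream so that subsequent growth uses
    # the Hellinger distance, covering that code path before the rule
    # descriptions below are checked.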
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = 'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n' + \
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n' +\
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000' \
        ' | class: 0\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000' \
        ' | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    assert expected_rules == learner.get_rules_description()