Example #1
def test_hoeffding_tree_coverage():
    max_samples = 1000
    max_size_mb = 2

    stream = RegressionGenerator(
        n_samples=max_samples, n_features=10, n_informative=7, n_targets=3,
        random_state=42
    )
    X, y = stream.next_sample(max_samples)

    # Will generate a warning concerning the invalid leaf prediction option
    tree = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='mean', grace_period=200,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20
    )

    # Trying to predict without fitting
    tree.predict(X[0])

    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    tree = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='adaptive', grace_period=200,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20,
        learning_ratio_const=False
    )
    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb
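The snippets in these examples omit their imports. A minimal preamble that would make them runnable, assuming scikit-multiflow's public layout (skmultiflow.data, skmultiflow.trees, skmultiflow.utils) plus the standard library and scikit-learn, is sketched below:

import os
import time
import pickle
from difflib import SequenceMatcher

import numpy as np
from sklearn.metrics import mean_absolute_error

from skmultiflow.data import RegressionGenerator
from skmultiflow.trees import StackedSingleTargetHoeffdingTreeRegressor
from skmultiflow.utils import calculate_object_size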
Example #2
def train_ssth():
    print("Start training Stacked Single Target Hoeffding Tree Regressor...")
    ssth_regressor = StackedSingleTargetHoeffdingTreeRegressor(
        max_byte_size=2000000000)

    start_time = time.time()

    # X_train and y_train are expected to already be in scope (training data)
    ssth_regressor.fit(X_train, y_train)
    print("Execution ssth completed in -- %s seconds --" %
          round(time.time() - start_time, 2))
    with open("models/ssth_regressor.p", "wb") as f:
        pickle.dump(ssth_regressor, f)
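As a complement to the training snippet above, here is a minimal sketch of loading the pickled model back and predicting with it; X_test is a hypothetical held-out feature matrix that is not defined in the original example:

def load_and_predict(X_test):
    # Restore the serialized regressor from disk
    with open("models/ssth_regressor.p", "rb") as f:
        ssth_regressor = pickle.load(f)
    # Multi-target predictions, one row per sample in X_test
    return ssth_regressor.predict(X_test)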
Example #3
def test_hoeffding_tree_coverage(test_path):
    # Cover nominal attribute observer
    test_file = os.path.join(test_path, 'multi_target_regression_data.npz')
    data = np.load(test_file)
    X = data['X']
    Y = data['Y']

    # Will generate a warning concerning the invalid leaf prediction option
    learner = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='mean',
        nominal_attributes=[i for i in range(3)],
        learning_ratio_const=False)
    learner.partial_fit(X, Y)
Example #4
def test_stacked_single_target_hoeffding_tree_regressor_adaptive(test_path):
    stream = RegressionGenerator(n_samples=2000, n_features=20,
                                 n_informative=15, random_state=1,
                                 n_targets=3)

    learner = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='adaptive',
        random_state=1
    )

    cnt = 0
    max_samples = 2000
    wait_samples = 200
    y_pred = np.zeros((int(max_samples / wait_samples), 3))
    y_true = np.zeros((int(max_samples / wait_samples), 3))

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred[int(cnt / wait_samples), :] = learner.predict(X)
            y_true[int(cnt / wait_samples), :] = y
        learner.partial_fit(X, y)
        cnt += 1

    test_file = os.path.join(
        test_path,
        'expected_preds_stacked_single_target_hoeffding_tree_adaptive.npy'
    )
    expected_predictions = np.load(test_file)

    assert np.allclose(y_pred, expected_predictions)
    error = mean_absolute_error(y_true, y_pred)

    expected_error = 152.8716829154756
    assert np.isclose(error, expected_error)

    expected_info = "StackedSingleTargetHoeffdingTreeRegressor(binary_split=False, grace_period=200,\n" \
                    "                                          leaf_prediction='adaptive',\n" \
                    "                                          learning_ratio_const=True,\n" \
                    "                                          learning_ratio_decay=0.001,\n" \
                    "                                          learning_ratio_perceptron=0.02,\n" \
                    "                                          max_byte_size=33554432,\n" \
                    "                                          memory_estimate_period=1000000,\n" \
                    "                                          nb_threshold=0, no_preprune=False,\n" \
                    "                                          nominal_attributes=None,\n" \
                    "                                          random_state=1,\n" \
                    "                                          remove_poor_atts=False,\n" \
                    "                                          split_confidence=1e-07,\n" \
                    "                                          stop_mem_management=False,\n" \
                    "                                          tie_threshold=0.05)"

    assert learner.get_info() == expected_info
    assert isinstance(learner.get_model_description(), str)
Example #5
def test_stacked_single_target_hoeffding_tree_categorical_features(test_path):
    data_path = os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    stream = np.load(data_path)
    X, y = stream[:, :-2], stream[:, -2:]

    nominal_attr_idx = np.arange(8)
    learner = StackedSingleTargetHoeffdingTreeRegressor(
        nominal_attributes=nominal_attr_idx, leaf_prediction='perceptron')

    learner.partial_fit(X, y)

    expected_description = "if Attribute 0 = -15.0:\n" \
                           "  if Attribute 3 = 0.0:\n" \
                           "    Leaf = Statistics {0: 80.0000, 1: [-192.4417, 80.0908], 2: [464.1268, 80.1882]}\n" \
                           "  if Attribute 3 = 1.0:\n" \
                           "    Leaf = Statistics {0: 77.0000, 1: [-184.8333, -7.2503], 2: [444.9068, 42.7423]}\n" \
                           "  if Attribute 3 = 2.0:\n" \
                           "    Leaf = Statistics {0: 56.0000, 1: [-134.1829, -1.0863], 2: [322.1336, 28.1218]}\n" \
                           "  if Attribute 3 = 3.0:\n" \
                           "    Leaf = Statistics {0: 62.0000, 1: [-148.2397, -17.2837], 2: [355.5327, 38.6913]}\n" \
                           "if Attribute 0 = 0.0:\n" \
                           "  Leaf = Statistics {0: 672.0000, 1: [390.6773, 672.0472], 2: [761.0744, 672.1686]}\n" \
                           "if Attribute 0 = 1.0:\n" \
                           "  Leaf = Statistics {0: 644.0000, 1: [671.3479, 174.3011], 2: [927.5194, 466.7064]}\n" \
                           "if Attribute 0 = 2.0:\n" \
                           "  Leaf = Statistics {0: 619.0000, 1: [867.2865, 320.6506], 2: [1262.0992, 435.2835]}\n" \
                           "if Attribute 0 = 3.0:\n" \
                           "  Leaf = Statistics {0: 627.0000, 1: [987.0864, 331.0822], 2: [1583.8108, 484.0456]}\n" \
                           "if Attribute 0 = -30.0:\n" \
                           "  Leaf = Statistics {0: 88.0000, 1: [-269.7967, 25.9328], 2: [828.2289, 57.6501]}\n"

    assert SequenceMatcher(None, expected_description,
                           learner.get_model_description()).ratio() > 0.9

    new_sample = X[0].copy()
    # Adding a new category on the split feature
    new_sample[0] = 7
    learner.predict([new_sample])

    # Let's do the same considering other prediction strategy
    learner = StackedSingleTargetHoeffdingTreeRegressor(
        nominal_attributes=nominal_attr_idx, leaf_prediction='adaptive')

    learner.partial_fit(X, y)
    learner.predict([new_sample])