def test_isoup_tree_categorical_features(test_path):
    data_path = os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    stream = np.load(data_path)
    X, y = stream[:, :-2], stream[:, -2:]

    nominal_attr_idx = np.arange(8)
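    # The first eight features are flagged as nominal, so the tree builds categorical splits on them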
    learner = iSOUPTreeRegressor(
        nominal_attributes=nominal_attr_idx,
        leaf_prediction='perceptron'
    )

    learner.partial_fit(X, y)

    expected_description = "if Attribute 0 = -15.0:\n" \
                           "  if Attribute 3 = 0.0:\n" \
                           "    Leaf = Statistics {0: 80.0000, 1: [-192.4417, 80.0908], 2: [464.1268, 80.1882]}\n" \
                           "  if Attribute 3 = 1.0:\n" \
                           "    Leaf = Statistics {0: 77.0000, 1: [-184.8333, -7.2503], 2: [444.9068, 42.7423]}\n" \
                           "  if Attribute 3 = 2.0:\n" \
                           "    Leaf = Statistics {0: 56.0000, 1: [-134.1829, -1.0863], 2: [322.1336, 28.1218]}\n" \
                           "  if Attribute 3 = 3.0:\n" \
                           "    Leaf = Statistics {0: 62.0000, 1: [-148.2397, -17.2837], 2: [355.5327, 38.6913]}\n" \
                           "if Attribute 0 = 0.0:\n" \
                           "  Leaf = Statistics {0: 672.0000, 1: [390.6773, 672.0472], 2: [761.0744, 672.1686]}\n" \
                           "if Attribute 0 = 1.0:\n" \
                           "  Leaf = Statistics {0: 644.0000, 1: [671.3479, 174.3011], 2: [927.5194, 466.7064]}\n" \
                           "if Attribute 0 = 2.0:\n" \
                           "  Leaf = Statistics {0: 619.0000, 1: [867.2865, 320.6506], 2: [1262.0992, 435.2835]}\n" \
                           "if Attribute 0 = 3.0:\n" \
                           "  Leaf = Statistics {0: 627.0000, 1: [987.0864, 331.0822], 2: [1583.8108, 484.0456]}\n" \
                           "if Attribute 0 = -30.0:\n" \
                           "  Leaf = Statistics {0: 88.0000, 1: [-269.7967, 25.9328], 2: [828.2289, 57.6501]}\n"

    assert SequenceMatcher(
        None, expected_description, learner.get_model_description()
    ).ratio() > 0.9
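    # A fuzzy match (ratio > 0.9) is used above so minor numeric formatting differences do not fail the test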

    new_sample = X[0].copy()
    # Adding a new category on the split feature
    new_sample[0] = 7
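    # Predicting on a category value the tree has never seen should not raise an error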
    learner.predict([new_sample])

    # Let's do the same considering other prediction strategy
    learner = iSOUPTreeRegressor(
        nominal_attributes=nominal_attr_idx,
        leaf_prediction='adaptive'
    )

    learner.partial_fit(X, y)
    learner.predict([new_sample])
Example 2
def test_isoup_tree_model_description():
    stream = RegressionGenerator(n_samples=700,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1,
                                 n_targets=3)

    learner = iSOUPTreeRegressor(leaf_prediction='mean')

    max_samples = 700
    X, y = stream.next_sample(max_samples)
    # Trying to predict without fitting
    learner.predict(X[0])

    learner.partial_fit(X, y)

    expected_description = "if Attribute 11 <= 0.36737233297880056:\n" \
                            "  Leaf = Statistics {0: 450.0000, 1: [-23322.8079, -30257.1616, -18740.9462], " \
                            "2: [22242706.1751, 29895648.2424, 18855571.7943]}\n" \
                            "if Attribute 11 > 0.36737233297880056:\n" \
                            "  Leaf = Statistics {0: 250.0000, 1: [33354.8675, 32390.6094, 22886.4176], " \
                            "2: [15429435.6709, 17908472.4289, 10709746.1079]}\n" \

    assert SequenceMatcher(None, expected_description,
                           learner.get_model_description()).ratio() > 0.9
Example 3
def demo(output_file=None):
    """ Test iSOUP-Tree

    This demo shows how to evaluate an iSOUP-Tree multi-target regressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    """
    stream = RegressionGenerator(n_samples=5000,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1,
                                 n_targets=7)

    regressor = iSOUPTreeRegressor(leaf_prediction='adaptive')

    # Setup the evaluator
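    # The 'average_*' metrics report errors averaged across all regression targets, which fits this multi-target setup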
    evaluator = EvaluatePrequential(pretrain_size=1,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=False,
                                    metrics=[
                                        'average_mean_square_error',
                                        'average_mean_absolute_error',
                                        'average_root_mean_square_error'
                                    ])

    # Evaluate
    evaluator.evaluate(stream=stream, model=regressor)
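
# Hypothetical entry point for running the demo above as a script; the output
# file name is arbitrary (pass None to skip writing results to a file).
if __name__ == '__main__':
    demo(output_file='isoup_tree_demo_summary.csv')
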
def test_evaluate_multi_target_regression_coverage(tmpdir):
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import iSOUPTreeRegressor

    max_samples = 1000

    # Stream
    stream = RegressionGenerator(n_samples=max_samples,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1,
                                 n_targets=7)

    # Learner
    mtrht = iSOUPTreeRegressor(leaf_prediction='adaptive')

    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    metrics = [
        'average_mean_square_error', 'average_mean_absolute_error',
        'average_root_mean_square_error'
    ]
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    evaluator.evaluate(stream=stream, model=mtrht, model_names=['MTRHT'])
Example 5
def test_evaluate_delayed_multi_target_regression_coverage(tmpdir):
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import iSOUPTreeRegressor

    max_samples = 1000

    # Stream
    data = RegressionGenerator(n_samples=max_samples,
                               n_features=20,
                               n_informative=15,
                               random_state=1,
                               n_targets=7)
    # Get X and y
    X, y = data.next_sample(max_samples)
    time = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream
    stream = TemporalDataStream(X, y, time, ordered=False)

    # Learner
    mtrht = iSOUPTreeRegressor(leaf_prediction='adaptive')

    output_file = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    metrics = [
        'average_mean_square_error', 'average_mean_absolute_error',
        'average_root_mean_square_error'
    ]
    evaluator = EvaluatePrequentialDelayed(max_samples=max_samples,
                                           metrics=metrics,
                                           output_file=output_file)

    evaluator.evaluate(stream=stream, model=mtrht, model_names=['MTRHT'])
def test_get_tags():
    classifier = HoeffdingTreeClassifier()
    regressor = HoeffdingTreeRegressor()
    multi_output_regressor = iSOUPTreeRegressor()

    classifier_tags = classifier._get_tags()

    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert classifier_tags == expected_tags

    regressor_tags = regressor._get_tags()
    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert regressor_tags == expected_tags

    multi_output_regressor_tags = multi_output_regressor._get_tags()
    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': True,
        'multioutput_only': True,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert multi_output_regressor_tags == expected_tags
def test_isoup_tree_coverage():
    max_samples = 1000
    max_size_mb = 2

    stream = RegressionGenerator(n_samples=max_samples,
                                 n_features=10,
                                 n_informative=7,
                                 n_targets=3,
                                 random_state=42)

    # Cover memory management
    tree = iSOUPTreeRegressor(leaf_prediction='mean',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)
    # Invalid split_criterion
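    # 'ICVR' (wrong case) is not a recognized option; the tree is expected to fall back to its default criterion when it tries to split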
    tree.split_criterion = 'ICVR'

    X, y = stream.next_sample(max_samples)
    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Memory management in a tree with perceptron leaves ('PERCEPTRON' is an intentionally invalid option for leaf_prediction)
    tree = iSOUPTreeRegressor(leaf_prediction='PERCEPTRON',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)
    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Memory management in a tree with adaptive leaves
    tree = iSOUPTreeRegressor(leaf_prediction='adaptive',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)

    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb
Example 8
def test_isoup_tree_coverage(test_path):
    # Cover nominal attribute observer
    test_file = os.path.join(test_path, 'multi_target_regression_data.npz')
    data = np.load(test_file)
    X = data['X']
    Y = data['Y']

    # Invalid leaf prediction option
    learner = iSOUPTreeRegressor(leaf_prediction='MEAN',
                                 nominal_attributes=[i for i in range(3)])
    print(learner.split_criterion)
    # Invalid split_criterion
    learner.split_criterion = 'ICVR'
    learner.partial_fit(X, Y)
Example 9
def test_isoup_tree_mean(test_path):
    stream = RegressionGenerator(n_samples=2000,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1,
                                 n_targets=3)
    stream.prepare_for_use()

    learner = iSOUPTreeRegressor(leaf_prediction='mean')

    cnt = 0
    max_samples = 2000
    wait_samples = 200
    y_pred = np.zeros((int(max_samples / wait_samples), 3))
    y_true = np.zeros((int(max_samples / wait_samples), 3))

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred[int(cnt / wait_samples), :] = learner.predict(X)
            y_true[int(cnt / wait_samples), :] = y
        learner.partial_fit(X, y)
        cnt += 1

    test_file = os.path.join(
        test_path, 'expected_preds_multi_target_regression_mean.npy')
    expected_predictions = np.load(test_file)

    assert np.allclose(y_pred, expected_predictions)

    error = mean_absolute_error(y_true, y_pred)
    expected_error = 191.2823924547882
    assert np.isclose(error, expected_error)

    expected_info = "iSOUPTreeRegressor(binary_split=False, grace_period=200, leaf_prediction='mean', " \
                    "learning_ratio_const=True, learning_ratio_decay=0.001, learning_ratio_perceptron=0.02, " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=None, random_state=None, remove_poor_atts=False, split_confidence=1e-07, " \
                    "stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    assert type(learner.predict(X)) == np.ndarray
import numpy as np

from skmultiflow.data import RegressionGenerator, MultilabelGenerator
from skmultiflow.trees import iSOUPTreeRegressor

# Setup a data stream
n_targets = 20
regression_stream = RegressionGenerator(n_targets=n_targets,
                                        random_state=1,
                                        n_samples=200)

ml_stream = MultilabelGenerator(random_state=1,
                                n_samples=200,
                                n_targets=n_targets,
                                n_features=10)
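# Note: only the multilabel stream (ml_stream) is consumed in the loop below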

# Setup iSOUP Tree Regressor
isoup_tree = iSOUPTreeRegressor()

# Auxiliary variables to control loop and track performance
n_samples = 0
max_samples = 200
y_pred = np.zeros((max_samples, n_targets))
y_true = np.zeros((max_samples, n_targets))

# Run test-then-train loop for max_samples and while there is data
while n_samples < max_samples and ml_stream.has_more_samples():
    X, y = ml_stream.next_sample()
    y_true[n_samples] = y[0]
    y_pred[n_samples] = isoup_tree.predict(X)[0]
    isoup_tree.partial_fit(X, y)
    n_samples += 1
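
# A simple way to summarize performance once the loop finishes (illustrative;
# any aggregate error over y_true and y_pred would do here):
print('iSOUP Tree regressor example')
print('Mean absolute error: {}'.format(np.mean(np.abs(y_true - y_pred))))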
        )
    },
    "oza_ml_cc_ht": {
        "name": "OzaBagging (cc) / ecc - ht",
        "model": lambda data_stream: ClassifierChain(
            HoeffdingTreeClassifier(),
            n_targets=data_stream.n_targets
        ),
        "ensemble": lambda model, stream: OzaBaggingMLClassifier(
            base_estimator=model,
            n_estimators=10
        )
    },
    "isoup": {
        "name": "iSoup-Tree",
        "model": lambda _: iSOUPTreeRegressor(),
        "ensemble": False
    },
}
DEFAULT_DATASETS = ["enron", "mediamill", "20NG"]

parser = argparse.ArgumentParser(description="Script to classify streams")
parser.add_argument("-e", "--experiment", help="Description of the experiment")
parser.add_argument("-d", "--datasets",
                    help="List of skmultilearn datasets (including 20NG)",
                    nargs="*", default=DEFAULT_DATASETS)
parser.add_argument("-m", "--models", help="List of models to train data",
                    nargs='*', default=SUPPORTED_MODELS.keys())
parser.add_argument("-s", "--streams", help="Path to stream", nargs='*')
parser.add_argument("-S", "--streamsnames", help="Names of streams", nargs='*')
parser.add_argument("-l", "--labels", type=int, help="Number of labels")