def test_isoup_tree_categorical_features(test_path):
    """Check iSOUP-Tree on nominal features and on an unseen category.

    The fixture array is split into features (all but the last two
    columns) and two targets (the last two columns). Attribute indices
    0-7 are declared nominal. The resulting model description must be at
    least 90% similar to the reference text, and predicting a sample whose
    split-feature value was never observed must not raise, both for
    perceptron and for adaptive leaves.
    """
    fixture_file = os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    dataset = np.load(fixture_file)
    X = dataset[:, :-2]
    y = dataset[:, -2:]

    nominal_idx = np.arange(8)

    perceptron_tree = iSOUPTreeRegressor(
        nominal_attributes=nominal_idx,
        leaf_prediction='perceptron',
    )
    perceptron_tree.partial_fit(X, y)

    expected_description = (
        "if Attribute 0 = -15.0:\n"
        " if Attribute 3 = 0.0:\n"
        " Leaf = Statistics {0: 80.0000, 1: [-192.4417, 80.0908], 2: [464.1268, 80.1882]}\n"
        " if Attribute 3 = 1.0:\n"
        " Leaf = Statistics {0: 77.0000, 1: [-184.8333, -7.2503], 2: [444.9068, 42.7423]}\n"
        " if Attribute 3 = 2.0:\n"
        " Leaf = Statistics {0: 56.0000, 1: [-134.1829, -1.0863], 2: [322.1336, 28.1218]}\n"
        " if Attribute 3 = 3.0:\n"
        " Leaf = Statistics {0: 62.0000, 1: [-148.2397, -17.2837], 2: [355.5327, 38.6913]}\n"
        "if Attribute 0 = 0.0:\n"
        " Leaf = Statistics {0: 672.0000, 1: [390.6773, 672.0472], 2: [761.0744, 672.1686]}\n"
        "if Attribute 0 = 1.0:\n"
        " Leaf = Statistics {0: 644.0000, 1: [671.3479, 174.3011], 2: [927.5194, 466.7064]}\n"
        "if Attribute 0 = 2.0:\n"
        " Leaf = Statistics {0: 619.0000, 1: [867.2865, 320.6506], 2: [1262.0992, 435.2835]}\n"
        "if Attribute 0 = 3.0:\n"
        " Leaf = Statistics {0: 627.0000, 1: [987.0864, 331.0822], 2: [1583.8108, 484.0456]}\n"
        "if Attribute 0 = -30.0:\n"
        " Leaf = Statistics {0: 88.0000, 1: [-269.7967, 25.9328], 2: [828.2289, 57.6501]}\n"
    )

    # A fuzzy comparison is used, so small differences in the rendered
    # description are tolerated.
    similarity = SequenceMatcher(
        None, expected_description, perceptron_tree.get_model_description()
    ).ratio()
    assert similarity > 0.9

    # Introduce a category on the split feature that was never seen in training
    unseen_sample = X[0].copy()
    unseen_sample[0] = 7
    perceptron_tree.predict([unseen_sample])

    # Repeat the unseen-category prediction with the other leaf strategy
    adaptive_tree = iSOUPTreeRegressor(
        nominal_attributes=nominal_idx,
        leaf_prediction='adaptive',
    )
    adaptive_tree.partial_fit(X, y)
    adaptive_tree.predict([unseen_sample])
def test_isoup_tree_model_description():
    """Compare the textual model description against a reference.

    A tree with mean leaves is trained on 700 generated samples with three
    targets. Its description must be at least 90% similar to the reference
    text. ``predict`` is also called once before any training to exercise
    the unfitted path.
    """
    n_samples = 700
    generator = RegressionGenerator(
        n_samples=700, n_features=20, n_informative=15, random_state=1, n_targets=3
    )
    tree = iSOUPTreeRegressor(leaf_prediction='mean')

    X, y = generator.next_sample(n_samples)

    # Predicting before fitting must not fail
    tree.predict(X[0])

    tree.partial_fit(X, y)

    expected_description = (
        "if Attribute 11 <= 0.36737233297880056:\n"
        " Leaf = Statistics {0: 450.0000, 1: [-23322.8079, -30257.1616, -18740.9462], "
        "2: [22242706.1751, 29895648.2424, 18855571.7943]}\n"
        "if Attribute 11 > 0.36737233297880056:\n"
        " Leaf = Statistics {0: 250.0000, 1: [33354.8675, 32390.6094, 22886.4176], "
        "2: [15429435.6709, 17908472.4289, 10709746.1079]}\n"
    )

    similarity = SequenceMatcher(
        None, expected_description, tree.get_model_description()
    ).ratio()
    assert similarity > 0.9
def demo(output_file=None):
    """ Test iSOUP-Tree

    Runs a prequential evaluation of an iSOUP-Tree multi-target regressor
    (adaptive leaf prediction) on a generated regression stream with
    5000 samples, 20 features and 7 targets.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    """
    data_stream = RegressionGenerator(
        n_samples=5000, n_features=20, n_informative=15, random_state=1, n_targets=7
    )

    model = iSOUPTreeRegressor(leaf_prediction='adaptive')

    # Averaged multi-target error metrics tracked during the run
    tracked_metrics = [
        'average_mean_square_error',
        'average_mean_absolute_error',
        'average_root_mean_square_error',
    ]

    # Configure the prequential evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=1,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        show_plot=False,
        metrics=tracked_metrics,
    )

    # Run the evaluation
    evaluator.evaluate(stream=data_stream, model=model)
def test_evaluate_multi_target_regression_coverage(tmpdir):
    """Smoke-test prequential evaluation for multi-target regression.

    Runs ``EvaluatePrequential`` with the three averaged multi-target
    metrics on an adaptive iSOUP-Tree, writing the summary CSV into
    ``tmpdir``. There are no assertions; the test passes when the run
    completes without raising.
    """
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import iSOUPTreeRegressor

    n_samples = 1000

    # Data source
    data_stream = RegressionGenerator(
        n_samples=n_samples, n_features=20, n_informative=15, random_state=1, n_targets=7
    )

    # Model under evaluation
    regressor = iSOUPTreeRegressor(leaf_prediction='adaptive')

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")

    evaluator = EvaluatePrequential(
        max_samples=n_samples,
        metrics=[
            'average_mean_square_error',
            'average_mean_absolute_error',
            'average_root_mean_square_error',
        ],
        output_file=summary_path,
    )

    evaluator.evaluate(stream=data_stream, model=regressor, model_names=['MTRHT'])
def test_evaluate_delayed_multi_target_regression_coverage(tmpdir):
    """Smoke-test delayed prequential evaluation for multi-target regression.

    Generated samples are paired with random dates and wrapped in a
    ``TemporalDataStream`` (``ordered=False``), then evaluated with
    ``EvaluatePrequentialDelayed`` on an adaptive iSOUP-Tree. The summary
    CSV is written into ``tmpdir``. There are no assertions; the test
    passes when the run completes without raising.
    """
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import iSOUPTreeRegressor

    n_samples = 1000

    # Draw the full data set up front so it can be paired with timestamps
    generator = RegressionGenerator(
        n_samples=n_samples, n_features=20, n_informative=15, random_state=1, n_targets=7
    )
    X, y = generator.next_sample(n_samples)
    timestamps = generate_random_dates(seed=1, samples=n_samples)

    # Temporal stream consumed by the delayed evaluator
    temporal_stream = TemporalDataStream(X, y, timestamps, ordered=False)

    # Model under evaluation
    regressor = iSOUPTreeRegressor(leaf_prediction='adaptive')

    summary_path = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")

    evaluator = EvaluatePrequentialDelayed(
        max_samples=n_samples,
        metrics=[
            'average_mean_square_error',
            'average_mean_absolute_error',
            'average_root_mean_square_error',
        ],
        output_file=summary_path,
    )

    evaluator.evaluate(stream=temporal_stream, model=regressor, model_names=['MTRHT'])
def test_get_tags():
    """Verify the estimator tags reported by the tree learners.

    The Hoeffding tree classifier and regressor must report the same
    single-output tag set. The iSOUP-Tree regressor must report that set
    with ``multioutput`` and ``multioutput_only`` switched to ``True``.
    """
    classifier = HoeffdingTreeClassifier()
    regressor = HoeffdingTreeRegressor()
    multi_output_regressor = iSOUPTreeRegressor()

    # Tag set expected from the single-output learners
    single_output_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False,
    }

    # The multi-output learner differs only in the two multioutput flags
    multi_output_tags = dict(single_output_tags, multioutput=True, multioutput_only=True)

    assert classifier._get_tags() == single_output_tags
    assert regressor._get_tags() == single_output_tags
    assert multi_output_regressor._get_tags() == multi_output_tags
def test_isoup_tree_coverage():
    """Exercise memory management for each leaf-prediction strategy.

    Every tree is given a 2 MB byte budget and is trained on the same
    1000 generated samples. After training, its measured size must not
    exceed that budget. The first tree is also given an invalid
    ``split_criterion`` before training.
    """
    max_samples = 1000
    max_size_mb = 2
    byte_budget = max_size_mb * 2**20

    stream = RegressionGenerator(
        n_samples=max_samples, n_features=10, n_informative=7, n_targets=3, random_state=42
    )

    # Cover memory management
    tree = iSOUPTreeRegressor(
        leaf_prediction='mean',
        grace_period=200,
        memory_estimate_period=100,
        max_byte_size=byte_budget,
    )
    # Invalid split_criterion
    tree.split_criterion = 'ICVR'

    X, y = stream.next_sample(max_samples)
    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Same check with perceptron leaves (purposeful typo in leaf_prediction)
    # and then with adaptive leaves
    for leaf_strategy in ('PERCEPTRON', 'adaptive'):
        tree = iSOUPTreeRegressor(
            leaf_prediction=leaf_strategy,
            grace_period=200,
            memory_estimate_period=100,
            max_byte_size=byte_budget,
        )
        tree.partial_fit(X, y)
        assert calculate_object_size(tree, 'MB') <= max_size_mb
def test_isoup_tree_coverage(test_path):
    """Cover the nominal attribute observer and invalid option handling.

    Loads the multi-target regression fixture, builds a learner with an
    invalid ``leaf_prediction`` value and three nominal attributes, assigns
    an invalid ``split_criterion`` and trains once. There are no
    assertions; the test passes when training completes without raising.

    Parameters
    ----------
    test_path: string
        Directory containing ``multi_target_regression_data.npz``.
    """
    # Cover nominal attribute observer
    test_file = os.path.join(test_path, 'multi_target_regression_data.npz')

    # np.load returns an NpzFile for .npz archives, which keeps the file
    # open until closed; read both arrays inside the context manager so the
    # handle is released deterministically.
    with np.load(test_file) as data:
        X = data['X']
        Y = data['Y']

    # Invalid leaf prediction option
    learner = iSOUPTreeRegressor(leaf_prediction='MEAN', nominal_attributes=list(range(3)))

    # Reading the property exercises its getter
    print(learner.split_criterion)

    # Invalid split_criterion
    learner.split_criterion = 'ICVR'
    learner.partial_fit(X, Y)
def test_isoup_tree_mean(test_path):
    """Regression test for iSOUP-Tree with mean leaf predictions.

    Runs a test-then-train loop over 2000 generated samples, recording a
    prediction every 200 samples. The recorded predictions, the resulting
    mean absolute error, the learner's ``get_info()`` text and the type
    returned by ``predict`` are compared with stored reference values.

    Parameters
    ----------
    test_path: string
        Directory containing
        ``expected_preds_multi_target_regression_mean.npy``.
    """
    stream = RegressionGenerator(
        n_samples=2000, n_features=20, n_informative=15, random_state=1, n_targets=3
    )
    stream.prepare_for_use()

    learner = iSOUPTreeRegressor(leaf_prediction='mean')

    cnt = 0
    max_samples = 2000
    wait_samples = 200
    n_checkpoints = max_samples // wait_samples
    y_pred = np.zeros((n_checkpoints, 3))
    y_true = np.zeros((n_checkpoints, 3))

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples. cnt == 0 is skipped, so row 0 of both
        # buffers keeps its initial zeros.
        if (cnt % wait_samples == 0) and (cnt != 0):
            checkpoint = cnt // wait_samples
            y_pred[checkpoint, :] = learner.predict(X)
            y_true[checkpoint, :] = y
        learner.partial_fit(X, y)
        cnt += 1

    test_file = os.path.join(
        test_path, 'expected_preds_multi_target_regression_mean.npy')
    expected_predictions = np.load(test_file)
    assert np.allclose(y_pred, expected_predictions)

    error = mean_absolute_error(y_true, y_pred)
    expected_error = 191.2823924547882
    assert np.isclose(error, expected_error)

    expected_info = (
        "iSOUPTreeRegressor(binary_split=False, grace_period=200, leaf_prediction='mean', "
        "learning_ratio_const=True, learning_ratio_decay=0.001, learning_ratio_perceptron=0.02, "
        "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=None, random_state=None, remove_poor_atts=False, split_confidence=1e-07, "
        "stop_mem_management=False, tie_threshold=0.05)"
    )
    # Collapse all whitespace runs (including line breaks) to single spaces
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    # X still holds the last sample drawn in the loop
    assert isinstance(learner.predict(X), np.ndarray)
from skmultiflow.data import RegressionGenerator, MultilabelGenerator
from skmultiflow.trees import iSOUPTreeRegressor

# Data streams
# NOTE(review): regression_stream is not consumed anywhere in this fragment;
# only ml_stream feeds the loop below.
n_targets = 20
regression_stream = RegressionGenerator(n_targets=n_targets, random_state=1, n_samples=200)
ml_stream = MultilabelGenerator(random_state=1, n_samples=200, n_targets=n_targets,
                                n_features=10)

# The model being demonstrated
isoup_tree = iSOUPTreeRegressor()

# Loop counter, sample cap and buffers for predictions / ground truth
n_samples = 0
max_samples = 200
y_pred = np.zeros((max_samples, n_targets))
y_true = np.zeros(y_pred.shape)

# Test-then-train: stop at max_samples or when the stream is exhausted
while n_samples < max_samples and ml_stream.has_more_samples():
    X, y = ml_stream.next_sample()

    # Record the target and the prediction made before the model sees it
    y_true[n_samples] = y[0]
    y_pred[n_samples] = isoup_tree.predict(X)[0]

    # Only then train on the sample
    isoup_tree.partial_fit(X, y)
    n_samples += 1
) }, "oza_ml_cc_ht": { "name": "OzaBagging (cc) / ecc - ht", "model": lambda data_stream: ClassifierChain( HoeffdingTreeClassifier(), n_targets=data_stream.n_targets ), "ensemble": lambda model, stream: OzaBaggingMLClassifier( base_estimator=model, n_estimators=10 ) }, "isoup": { "name": "iSoup-Tree", "model": lambda _: iSOUPTreeRegressor(), "ensemble": False }, } DEFAULT_DATASETS = ["enron", "mediamill", "20NG"] parser = argparse.ArgumentParser("Script to classify streams") parser.add_argument("-e", "--experiment", help="Description of the experiment") parser.add_argument("-d", "--datasets", help="List of skmultilearn datasets (including 20NG)", nargs="*", default=DEFAULT_DATASETS) parser.add_argument("-m", "--models", help="List of models to train data", nargs='*', default=SUPPORTED_MODELS.keys()) parser.add_argument("-s", "--streams", help="Path to stream", nargs='*') parser.add_argument("-S", "--streamsnames", help="Names of streams", nargs='*') parser.add_argument("-l", "--labels", type=int, help="Number of labels")