def test_hoeffding_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = HoeffdingTree(max_byte_size=30, memory_estimate_period=100, grace_period=10,
                            leaf_prediction='mc')
    learner.partial_fit(X, y, classes=stream.target_values)
    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1, n_num_features=0,
                                 n_categories_per_cat_feature=2)
    stream.prepare_for_use()
    X, y = stream.next_sample(1000)
    learner = HoeffdingTree(leaf_prediction='mc', nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)
def demo():
    """ _test_streams

    This demo tests if the streams are correctly generating samples.

    :return:
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_seed=32523423,
                                        instance_seed=5435, n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    sea = SEAGenerator()

    print('1 instance:\n')
    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    learner = ExtremelyFastDecisionTreeClassifier(max_byte_size=30, memory_estimate_period=100,
                                                  grace_period=10, leaf_prediction='nba')
    learner.partial_fit(X, y, classes=stream.target_values)
    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=2,
                                 n_cat_features=2, n_categories_per_cat_feature=4,
                                 n_num_features=1, max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)
    learner = ExtremelyFastDecisionTreeClassifier(leaf_prediction='nba',
                                                  nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=stream.target_values)
def test_accuracy():
    # An ensemble of Adaptive Random Forests should reach at least 80% accuracy
    # on 200 test instances of SEAGenerator
    n_samples_train = 200
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    arf = AdaptiveRandomForest()
    desdd = DESDDMethod(arf)

    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    desdd.partial_fit(X_train, y_train)

    assert desdd.score(X_test, y_test) > 0.80
def demo():
    """ _test_leverage_bagging

    This demo tests the LeverageBagging classifier on a stream of instances
    coming from a SEA generator.

    The test computes the performance of the LeverageBagging classifier as well
    as the time to create the structure and classify max_samples (2000 by
    default) instances.
    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    stream.prepare_for_use()
    clf = LeverageBagging(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30),
                          n_estimators=1, random_state=1)
    sample_count = 0
    correctly_classified = 0
    max_samples = 2000
    train_size = 200
    first = True

    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        first = False

    logging.info('%s%%', 0.0)
    while sample_count < max_samples:
        if (sample_count + 1) % (max_samples / 20) == 0:
            logging.info('%s%%', str(((sample_count // (max_samples / 20) + 1) * 5)))
        X, y = stream.next_sample(2)
        my_pred = clf.predict(X)

        if first:
            clf.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None:
            if y[0] == my_pred[0]:
                correctly_classified += 1

        sample_count += 1

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
    print(clf.get_info())
def test_accuracy():
    # An ensemble of Naive Bayes classifiers should reach at least 85% accuracy
    # on 200 test instances of SEAGenerator
    chunk_size = 100
    n_samples_train = 1050
    n_samples_test = 200
    gen = SEAGenerator(noise_percentage=0.0)
    # gen.prepare_for_use()
    nb = NaiveBayes()
    mde = MDEMethod(nb, chunk_size, KNORAU())

    X_train, y_train = gen.next_sample(n_samples_train)
    X_test, y_test = gen.next_sample(n_samples_test)
    mde.partial_fit(X_train, y_train)

    assert mde.score(X_test, y_test) > 0.85
def demo():
    """ _test_oza_bagging_adwin

    This demo tests the OzaBaggingADWINClassifier using KNNADWINClassifier as
    base estimator on samples given by a SEAGenerator.

    The test computes the performance of the OzaBaggingADWINClassifier as well
    as the time to create the structure and classify max_samples (20000 by
    default) instances.
    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=1)
    clf = OzaBaggingADWINClassifier(base_estimator=KNNADWINClassifier(n_neighbors=8,
                                                                      max_window_size=2000,
                                                                      leaf_size=30),
                                    n_estimators=2, random_state=1)
    sample_count = 0
    correctly_classified = 0
    max_samples = 20000
    train_size = 10
    first = True

    if train_size > 0:
        X, y = stream.next_sample(train_size)
        clf.partial_fit(X, y, classes=stream.target_values)
        first = False

    while sample_count < max_samples:
        if sample_count % (max_samples / 20) == 0:
            logging.info('%s%%', str((sample_count // (max_samples / 20) * 5)))
        X, y = stream.next_sample()
        my_pred = clf.predict(X)

        if first:
            clf.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            clf.partial_fit(X, y)

        if my_pred is not None:
            if y[0] == my_pred[0]:
                correctly_classified += 1

        sample_count += 1

    print(str(sample_count) + ' samples analyzed.')
    print('My performance: ' + str(correctly_classified / sample_count))
def test_clone():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    cloned = clone(learner)

    # The original model keeps its learned state; the clone must start untrained
    assert learner._observed_class_distribution != {} and cloned._observed_class_distribution == {}
def run(self):
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    stream = SEAGenerator()

    # Signal the model server to restore the persisted model from the local pickle file
    producer.send(topic='testTopicAdmin',
                  value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
                  key=b'extract')
    time.sleep(10)

    while not self.stop_event.is_set():
        dummy_data, dummy_label = stream.next_sample()
        print("Dummy Event Generated:", str(dummy_data))
        # Pack features and label into a single flat array before serializing
        dummy_data_and_label = np.concatenate((dummy_data, dummy_label), axis=None)
        producer.send(topic='testTopic',
                      value=dummy_data_and_label.tobytes(),
                      key=b'labeled')
        time.sleep(1)

    # Signal the model server to persist the model
    producer.send(topic='testTopicAdmin',
                  value=b'../modelPersist/online_hoeffding_tree_persist.pkl',
                  key=b'flush')
    producer.close()
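# A minimal consumer-side sketch (an assumption for illustration; not part of this
# file) showing how the labeled messages produced above can be decoded. Because
# np.concatenate((dummy_data, dummy_label), axis=None) yields a flat float64 array
# whose last element is the label, np.frombuffer inverts tobytes() directly.
from kafka import KafkaConsumer
import numpy as np

consumer = KafkaConsumer('testTopic', bootstrap_servers='localhost:9092')
for message in consumer:
    if message.key == b'labeled':
        record = np.frombuffer(message.value, dtype=np.float64)
        X, y = record[:-1].reshape(1, -1), int(record[-1])
        print("Decoded sample:", X, "label:", y)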
def create_sea_drift_dataset(n_samples_per_concept=200, concepts=[0, 1, 2, 3]):
    X_stream = []
    Y_stream = []
    concept_drifts = []
    t = 0

    gen = SEAGenerator()
    gen.prepare_for_use()

    for _ in concepts:
        if t != 0:
            concept_drifts.append(t)
        X, y = gen.next_sample(batch_size=n_samples_per_concept)
        X_stream.append(X)
        Y_stream.append(y)
        gen.generate_drift()
        t += n_samples_per_concept

    return {"data": (np.concatenate(X_stream, axis=0),
                     np.concatenate(Y_stream, axis=0).reshape(-1, 1)),
            "drifts": np.array(concept_drifts)}
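# Illustrative usage of create_sea_drift_dataset (a sketch; the shapes follow from
# the defaults above: 4 concepts x 200 samples each, SEA's 3 numeric features, and
# drift positions recorded at t = 200, 400, 600):
#
#     dataset = create_sea_drift_dataset()
#     X, y = dataset["data"]       # X: (800, 3), y: (800, 1)
#     print(dataset["drifts"])     # [200 400 600]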
def test_evaluate_delayed_coverage(tmpdir):
    from skmultiflow.data import SEAGenerator
    from skmultiflow.bayes import NaiveBayes

    max_samples = 1000

    # Stream
    data = SEAGenerator(random_state=1)
    # Get X and y
    X, y = data.next_sample(max_samples)
    time = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream
    stream = TemporalDataStream(X, y, time, ordered=False)

    # Learner
    nb = NaiveBayes()

    output_file = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    metrics = ['running_time', 'model_size']
    evaluator = EvaluatePrequentialDelayed(max_samples=max_samples,
                                           metrics=metrics,
                                           data_points_for_classification=True,
                                           output_file=output_file)

    evaluator.evaluate(stream=stream, model=nb, model_names=['NB'])
def test_half_space_trees(test_path):
    stream = SEAGenerator(classification_function=0, noise_percentage=0.1, random_state=1)

    learner = HalfSpaceTrees(n_estimators=13, size_limit=75, anomaly_threshold=0.90,
                             depth=10, random_state=5)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    y_proba = []
    wait_samples = 500

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Scale inputs between 0 and 1 (SEA features lie in the [0, 10] range)
        X = X / 10
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 0, 0, 1, 0, 0, 1, 0])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_half_space_trees.npy')
    expected_proba = np.load(test_file)
    assert np.allclose(y_proba, expected_proba)
def test_hoeffding_tree_model_information():
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.get_model_measurements
    for k in expected_info:
        assert k in observed_info
        assert expected_info[k] == observed_info[k]

    expected_description = "if Attribute 0 <= 4.549969620513424:\n" \
                           "  if Attribute 1 <= 5.440182925299016:\n" \
                           "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n" \
                           "  if Attribute 1 > 5.440182925299016:\n" \
                           "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n" \
                           "if Attribute 0 > 4.549969620513424:\n" \
                           "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"

    assert expected_description == learner.get_model_description()
def test_knn():
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                       1, 1, 0, 1, 0, 0, 1, 0, 1])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    expected_info = 'KNN(leaf_size=40, max_window_size=2000, n_neighbors=8, nominal_attributes=None)'
    assert learner.get_info() == expected_info

    learner.reset()
    assert learner.get_info() == expected_info

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    learner.fit(X_batch[:4500], y_batch[:4500], classes=[0, 1])
    predictions = learner.predict(X_batch[4501:4550])
    expected_predictions = array('i', [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
                                       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
                                       1, 1, 1, 1, 1, 1, 0, 1, 0])
    assert np.alltrue(predictions == expected_predictions)

    correct_predictions = sum(predictions == y_batch[4501:4550])
    expected_correct_predictions = 49
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)
    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
                                       1, 1, 0, 1, 0, 1, 1, 0, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = "PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, " \
                    "eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5, " \
                    "n_jobs=None, penalty=None, random_state=1, shuffle=True, tol=0.001, " \
                    "validation_fraction=0.1, verbose=0, warm_start=False)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    # Coverage tests
    learner.reset()
    if not sklearn_version.startswith("0.21"):
        learner.fit(X=np.asarray(X_batch[:4500]), y=np.asarray(y_batch[:4500], dtype=int))
    else:
        # The root cause of the failure (TypeError: an integer is required) is in the
        # fit() method of sklearn 0.21.0. This is a workaround until a fix is made
        # available in sklearn.
        learner.partial_fit(X=np.asarray(X_batch[:4500]), y=np.asarray(y_batch[:4500]),
                            classes=stream.target_values)
    learner.predict(X=X_batch[4501:])   # Run for coverage

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_standardize():
    stream = SEAGenerator(random_state=1)
    learner = WeightedKNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
                                    standardize=True)

    # Test the incremental (moving) average and standard deviation used for standardization
    example_features = np.array([np.array([[1, 2]]), np.array([[2, 4]]), np.array([[3, 9]])])
    example_targets = np.array([[1], [1], [1]])
    for i in range(len(example_features)):
        learner.partial_fit(example_features[i], example_targets[i])

    moving_average = learner.get_mean
    assert type(moving_average) is np.ndarray
    assert np.alltrue(moving_average == np.array([[2, 5]]))

    moving_sd = learner.get_sd
    assert type(moving_sd) is np.ndarray
    assert np.alltrue(moving_sd.astype(int) == np.array([[0, 2]]))

    stream = SEAGenerator(random_state=1)
    learner = WeightedKNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
                                    standardize=True)

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 100
    X_batch = []
    y_batch = []

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                       1, 1, 0, 1, 0, 0, 1, 0, 1])
    assert np.alltrue(predictions == expected_predictions)
def test_ensemble_size():
    # Since each member of the ensemble is initialized once the number of seen instances
    # reaches the chunk size, the size of the ensemble should be n_samples // chunk_size
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator()
    # gen.prepare_for_use()
    dynse = DYNSEMethod(NaiveBayes(), chunk_size, ModifiedRank())

    X, y = gen.next_sample(n_samples)
    dynse.partial_fit(X, y)

    assert len(dynse.ensemble) == n_samples // chunk_size
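# Worked example of the sizing rule above (illustrative, not part of the test):
# with chunk_size = 100 and n_samples = 1050, ten full chunks are observed, so the
# ensemble ends up with 1050 // 100 == 10 members; the trailing 50 samples never
# complete a chunk and therefore add no member.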
def test_ensemble_size():
    # Since each member of the ensemble is initialized once the number of seen instances
    # reaches the chunk size, the size of the ensemble should be n_samples // chunk_size
    chunk_size = 100
    n_samples = 1050
    gen = SEAGenerator(balance_classes=True)
    # gen.prepare_for_use()
    mde = MDEMethod(NaiveBayes(), chunk_size, KNORAE(), alpha=0.0)

    X, y = gen.next_sample(n_samples)
    mde.partial_fit(X, y)

    assert len(mde.ensemble) == n_samples // chunk_size
def test_perceptron(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()
    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
                                       1, 1, 0, 1, 0, 1, 1, 0, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'PerceptronMask: - penalty: None - alpha: 0.0001 - fit_intercept: True - max_iter: 1000 ' \
                    '- tol: 0.001 - shuffle: True - eta0: 1.0 - warm_start: False - class_weight: None - n_jobs: 1'
    assert learner.get_info() == expected_info

    # Coverage tests
    learner.reset()
    learner.fit(X=X_batch[:4500], y=y_batch[:4500])
    y_pred = learner.predict(X=X_batch[4501:])
    accuracy = accuracy_score(y_true=y_batch[4501:], y_pred=y_pred)
    expected_accuracy = 0.8897795591182365
    # assert np.isclose(expected_accuracy, accuracy)  # Removed due to a non-replicable error in the Travis build

    assert 'estimator' == learner.get_class_type()
    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_naive_bayes(test_path):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
                                       1, 1, 0, 1, 0, 0, 1, 1, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    expected_score = 0.9378757515030061
    assert np.isclose(expected_score,
                      learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()
    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_online_csb2():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineCSB2Classifier(base_estimator=nb, n_estimators=3, cost_positive=1,
                                   cost_negative=0.9, random_state=112)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    expected_correct_predictions = 43
    expected_performance = 0.8775510204081632

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineCSB2Classifier(base_estimator=NaiveBayes(nominal_attributes=None), " \
                    "cost_negative=0.9, cost_positive=1, drift_detection=True, n_estimators=3, " \
                    "random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def test_online_rus_1():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineRUSBoost(base_estimator=nb, n_estimators=3, sampling_rate=5,
                             algorithm=1, random_state=112)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
                            0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 1, 0, 0, 1, 0, 0, 1, 1]
    expected_correct_predictions = 33
    expected_performance = 0.673469387755102

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OnlineRUSBoost(algorithm=1, base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "               drift_detection=True, n_estimators=3, random_state=112,\n" \
                    "               sampling_rate=5)"
    assert learner.get_info() == expected_info
def test_leverage_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = LeverageBaggingClassifier(base_estimator=knn, n_estimators=3, random_state=112)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8571428571428571
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 42
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "LeverageBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, max_window_size=2000, " \
                    "n_neighbors=8, nominal_attributes=None), delta=0.002, enable_code_matrix=False, " \
                    "leverage_algorithm='leveraging_bag', n_estimators=3, random_state=112, w=6)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def demo():
    """ _test_streams

    This demo tests if the streams are correctly generating samples.

    :return:
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    stream.prepare_for_use()
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_seed=32523423,
                                        instance_seed=5435, n_classes=2, n_features=10,
                                        num_drift_centroids=50)
    rbf_drift.prepare_for_use()
    sea = SEAGenerator()

    print('1 instance:\n')
    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
def test_additive_expert_ensemble_weakest():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()

    learner = AdditiveExpertEnsemble(3, NaiveBayes(), beta=0.5, gamma=0.1,
                                     pruning='weakest')

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0
    first = True

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    expected_correct_predictions = 45
    expected_performance = 0.9183673469387755

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = "AdditiveExpertEnsemble(base_estimator=NaiveBayes(nominal_attributes=None),\n" \
                    "                       beta=0.5, gamma=0.1, n_estimators=3, pruning='weakest')"
    assert learner.get_info() == expected_info
def test_oza_bagging_adwin():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    knn = KNN(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingAdwin(base_estimator=knn, n_estimators=3, random_state=112)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OzaBaggingAdwin(base_estimator=KNN(leaf_size=40, max_window_size=2000,\n" \
                    "                                    n_neighbors=8, nominal_attributes=None),\n" \
                    "                n_estimators=3, random_state=112)"
    assert learner.get_info() == expected_info
def test_oza_bagging():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    knn = KNNClassifier(n_neighbors=8, leaf_size=40, max_window_size=2000)
    learner = OzaBaggingClassifier(base_estimator=knn, n_estimators=3, random_state=112)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    assert np.alltrue(predictions == expected_predictions)

    expected_performance = 0.8979591836734694
    assert np.isclose(expected_performance, performance)

    expected_correct_predictions = 44
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    expected_info = "OzaBaggingClassifier(base_estimator=KNNClassifier(leaf_size=40, " \
                    "max_window_size=2000, metric='euclidean', n_neighbors=8), " \
                    "n_estimators=3, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    max_size_kb = 20
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    # Unconstrained model has over 50 kB
    learner = ExtremelyFastDecisionTreeClassifier(leaf_prediction='mc',
                                                  memory_estimate_period=200,
                                                  max_byte_size=max_size_kb * 2**10,
                                                  min_samples_reevaluate=2500)
    learner.partial_fit(X, y, classes=stream.target_values)
    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=2,
                                 n_cat_features=2, n_categories_per_cat_feature=4,
                                 n_num_features=1, max_tree_depth=30, min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    X, y = stream.next_sample(5000)
    learner = ExtremelyFastDecisionTreeClassifier(leaf_prediction='nba',
                                                  nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=stream.target_values)
def test_dynamic_weighted_majority():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)

    learner = DynamicWeightedMajorityClassifier(3, NaiveBayes(), beta=0.5, theta=0.01)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0
    first = True

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    expected_correct_predictions = 44
    expected_performance = 0.8979591836734694

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

    expected_info = 'DynamicWeightedMajorityClassifier(base_estimator=NaiveBayes(nominal_attributes=None),\n' \
                    '                                  beta=0.5, n_estimators=3, period=50,\n' \
                    '                                  theta=0.01)'
    assert learner.get_info() == expected_info
def test_online_adac2():
    stream = SEAGenerator(1, noise_percentage=0.067, random_state=112)
    stream.prepare_for_use()
    nb = NaiveBayes()
    learner = OnlineAdaC2(base_estimator=nb, n_estimators=3, random_state=112,
                          cost_positive=1, cost_negative=1)

    first = True
    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        if first:
            learner.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
                            0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
                            0, 1, 1, 0, 1, 0, 0, 1, 1]
    expected_correct_predictions = 44
    expected_performance = 0.8979591836734694

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray