def demo(instances=2000):
    """ _test_comparison_prequential

    Run a prequential evaluation with two learners at once, which turns the
    run into a comparison task. The learners are a bare SGDClassifier and a
    pipeline that chains OneHotToCategorical with KNNAdwin.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    data_stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    data_stream.prepare_for_use()

    # Learners under comparison
    sgd = SGDClassifier()
    knn_adwin = KNNAdwin(n_neighbors=8, max_window_size=1000, leaf_size=30)

    # Two groups of column indices handed to the transform: 10-13 and 14-53
    one_hot_groups = [list(range(10, 14)), list(range(14, 54))]
    to_categorical = OneHotToCategorical(one_hot_groups)

    knn_pipeline = Pipeline([
        ('one_hot_to_categorical', to_categorical),
        ('KNN', knn_adwin),
    ])
    learners = [sgd, knn_pipeline]

    # Setup the evaluator
    evaluator_settings = {
        'pretrain_size': 2000,
        'output_file': 'test_comparison_prequential.csv',
        'max_samples': instances,
        'batch_size': 1,
        'n_wait': 200,
        'max_time': 1000,
        'show_plot': True,
        'metrics': ['performance', 'kappa_t'],
    }
    evaluator = EvaluatePrequential(**evaluator_settings)

    # Evaluate
    evaluator.evaluate(stream=data_stream, model=learners)
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical -> KNNADWINClassifier pipeline prequentially.

    Loads the ``data-one-hot.npz`` fixture, runs ``EvaluatePrequential`` with
    ``pretrain_size=10`` and ``max_samples=100``, then checks the mean
    accuracy, the mean kappa and the text returned by ``Pipeline.get_info()``.

    Parameters
    ----------
    test_path: str
        Directory that contains ``data-one-hot.npz``.

    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # np.load returns an archive handle for .npz files; read the arrays inside
    # a ``with`` so the underlying file is closed once they are in memory.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases, so cast with the builtin directly.
    stream = DataStream(data=X, y=y.astype(int))

    # Setup transformer: n_categories consecutive column indices per group
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    # Collapse every run of whitespace (including newlines) to a single space
    # so the comparison does not depend on how get_info() wraps its output.
    info = " ".join(pipe.get_info().split())
    assert info == expected_info
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical -> KNNAdwin pipeline prequentially.

    Loads the ``data-one-hot.npz`` fixture, runs ``EvaluatePrequential`` with
    ``pretrain_size=10`` and ``max_samples=100``, then checks the mean
    accuracy, the mean kappa and the exact text of ``Pipeline.get_info()``.

    Parameters
    ----------
    test_path: str
        Directory that contains ``data-one-hot.npz``.

    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # np.load returns an archive handle for .npz files; read the arrays inside
    # a ``with`` so the underlying file is closed once they are in memory.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']
    stream = DataStream(data=X, y=y)
    stream.prepare_for_use()

    # Setup transformer: n_categories consecutive column indices per group
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNAdwin', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].get_accuracy())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].get_kappa())
    print(pipe.get_info())
    expected_info = "Pipeline:\n" \
                    "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n" \
                    " [10, 11, 12, 13, 14],\n" \
                    " [15, 16, 17, 18, 19],\n" \
                    " [20, 21, 22, 23, 24]])\n" \
                    "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n" \
                    " nominal_attributes=None)]"
    assert pipe.get_info() == expected_info
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical -> KNNADWINClassifier pipeline via event observers.

    Turns every row of the ``data-one-hot.npz`` fixture into a one-sample
    record, feeds the records through an ``ArrayDataSource`` wired to an
    ``EvaluationEventObserver``, and checks the accuracy and kappa found in
    the reporter's buffer.

    Parameters
    ----------
    test_path: str
        Directory that contains ``data-one-hot.npz``.

    """
    n_categories = 5
    n_features = n_categories * n_categories  # 25 columns per record

    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # Pull both arrays out of the archive exactly once: indexing the handle
    # returned by np.load reads the array from the file again on each access,
    # and the handle keeps the file open until it is closed.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']

    # One record per sample, shaped as a single-row batch
    data_as_dict = [
        {'X': features.reshape(1, n_features), 'y': np.array(label).reshape(1, 1)}
        for features, label in zip(X, y)
    ]

    # Setup transformer: n_categories consecutive column indices per group
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_features, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNADWINClassifier', classifier)])

    train_eval_trigger = PrequentialTrigger(10)
    reporter = BufferedMetricsReporter(retrieve_metrics)
    results_observer = MetricsResultObserver(ClassificationMeasurements(), reporter)
    evaluation_event_observer = EvaluationEventObserver(
        pipe, train_eval_trigger, [results_observer], [0, 1])

    data_source = ArrayDataSource(record_to_dictionary, [evaluation_event_observer], data_as_dict)

    data_source.listen_for_events()
    # NOTE(review): presumably the records are processed asynchronously and this
    # fixed wait lets them drain before the buffer is read - confirm; a
    # completion signal would be less timing-sensitive than a sleep.
    time.sleep(3)

    expected_accuracy = 0.5555555555555556
    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_accuracy, reporter.get_buffer()['accuracy'])
    assert np.isclose(expected_kappa, reporter.get_buffer()['kappa'])
def demo(max_samples=10000):
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives
    instances coming from a SEA generator. Each instance is first predicted
    and then used to update KNNAdwin; a KNeighborsClassifier that is fitted
    once on the warm-up batch and never updated serves as the baseline.

    The demo prints the elapsed time, the number of instances processed and
    the fraction of correct predictions of both models.

    Parameters
    ----------
    max_samples: int
        Number of instances to classify after the warm-up batch
        (10000 by default).

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    stream.prepare_for_use()

    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')

    # Warm both models up on the same batch. The first partial_fit call is the
    # only one that is given the full set of class labels.
    pretrain_size = 200
    X, y = stream.next_sample(pretrain_size)
    knn.partial_fit(X, y, classes=stream.target_values)
    compare.fit(X, y)

    # Report progress about every 5%; integer arithmetic keeps the log free of
    # float artefacts ("5.0%") and the max() avoids a zero step for tiny runs.
    progress_step = max(1, max_samples // 20)

    n_samples = 0
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % progress_step == 0:
            logging.info('%s%%', 100 * n_samples // max_samples)

        X, y = stream.next_sample()

        # Test-then-train: predict with the current model, then learn the sample
        my_pred = knn.predict(X)
        knn.partial_fit(X, y)
        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1

        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    # Nothing to report (and nothing to divide by) when no sample was processed
    if n_samples:
        print('My performance: ' + str(my_corrects / n_samples))
        print('Compare performance: ' + str(compare_corrects / n_samples))
def demo(instances=2000):
    """ _test_comparison_prequential

    Run a prequential evaluation with two learners at once, which turns the
    run into a comparison task. The learners are a bare SGDClassifier and a
    pipeline that chains OneHotToCategorical with KNNADWINClassifier.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    dataset_url = ("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                   "master/covtype.csv")
    data_stream = FileStream(dataset_url)

    # Learners under comparison
    sgd = SGDClassifier()
    knn_adwin = KNNADWINClassifier(n_neighbors=8, max_window_size=1000, leaf_size=30)

    # Two groups of column indices handed to the transform: 10-13 and 14-53
    one_hot_groups = [list(range(10, 14)), list(range(14, 54))]
    to_categorical = OneHotToCategorical(one_hot_groups)

    knn_pipeline = Pipeline([
        ('one_hot_to_categorical', to_categorical),
        ('KNNClassifier', knn_adwin),
    ])
    learners = [sgd, knn_pipeline]

    # Setup the evaluator
    evaluator_settings = {
        'pretrain_size': 2000,
        'output_file': 'test_comparison_prequential.csv',
        'max_samples': instances,
        'batch_size': 1,
        'n_wait': 200,
        'max_time': 1000,
        'show_plot': True,
    }
    evaluator = EvaluatePrequential(**evaluator_settings)

    # Evaluate
    evaluator.evaluate(stream=data_stream, model=learners)
def demo():
    """ _test_kdtree_compare

    This demo compares creation and query speed for different kd tree
    implementations. They are fed with instances from the covtype dataset.

    Three kd tree implementations are compared: SciPy's KDTree
    (``spatial.KDTree``), scikit-learn's KDTree (``ng.KDTree``) and the
    ``KDTree`` name in scope in this module (labelled "Optimal" in the output;
    presumably scikit-multiflow's own implementation).

    For each of them the demo times the construction of the tree on 1000
    instances, and then the time to query the 8 nearest neighbours of 100
    further instances. The results are displayed in the terminal.

    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    n_build = 1000     # instances used to build each tree
    n_queries = 100    # instances queried against each tree
    n_neighbors = 8

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    # Load the file before drawing samples from the stream
    stream.prepare_for_use()

    # Two groups of column indices handed to the transform: 10-13 and 14-53
    transformer = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])

    X, _ = stream.next_sample(n_build)
    X = transformer.transform(X)

    X_find, _ = stream.next_sample(n_queries)
    X_find = transformer.transform(X_find)
    print(X_find[4])

    # Normal kdtree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end-start))

    # Every implementation answers the same n_queries queries so that the
    # reported query times are directly comparable.
    start = timer()
    for i in range(n_queries):
        ind = scipy_tree.query(X_find[i], n_neighbors)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    # Drop each tree before building the next one
    del scipy_tree

    # Fast kdtree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        ind, dist = opt.query(X_find[i], n_neighbors)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # Sklearn kdtree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        # This query is given a 2-D array, so wrap the single sample as one row
        ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), n_neighbors, return_distance=True)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk