def demo(): """ _test_filters This demo test the MissingValuesCleaner filter. The transform is set to clean any value equal to -47, replacing it with the median value of the last 10 samples, or less if there aren't 10 samples available. The output will be the 10 instances used in the transform. The first 9 are kept untouched, as they don't have any feature value of -47. The last samples has its first feature value equal to -47, so it's replaced by the median of the 9 first samples. """ opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv', False) stream = FileStream(opt, -1, 1) stream.prepare_for_use() filter = MissingValuesCleaner(-47, 'median', 10) X, y = stream.next_instance(10) X[9, 0] = -47 for i in range(10): temp = filter.partial_fit_transform([X[i].tolist()]) print(temp)
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. In this case we use a file stream
    which gets its samples from the movingSquares.csv file, inside the
    datasets folder.

    Then we need to set up a classifier, which in this case is an instance
    of scikit-multiflow's SAMKNN. Optionally, we can wrap it in a pipeline
    structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's maximum number of instances

    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    opt = FileOption("FILE", "OPT_NAME", "../datasets/movingSquares.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000, leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = SAMKNN(n_neighbors=5, knnWeights='distance', maxSize=1000,
                        STMSizeAdaption='maxACCApprox', useLTM=False)
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline (optional)
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(pretrain_size=0, max_instances=instances, batch_size=1,
                               n_wait=100, max_time=1000, output_file=output_file,
                               task_type='classification', show_plot=True,
                               plot_options=['performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=classifier)
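# For reference, a bare-bones version of the test-then-train (prequential)
# protocol that EvaluatePrequential automates: each incoming sample is used
# for prediction first and for training afterwards. This is an illustrative
# sketch assuming a stream and classifier like the ones set up in the demo.
def prequential_sketch(stream, classifier, instances=1000):
    correct = 0
    for i in range(instances):
        X, y = stream.next_instance()
        if i > 0:
            # Test on the sample before the model has been trained on it...
            correct += int(classifier.predict(X)[0] == y[0])
        # ...then train on that same sample
        classifier.partial_fit(X, y)
    return correct / max(1, instances - 1)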
def demo(): """ _test_knn This demo tests the KNN classifier on a file stream, which gives instances coming from a SEA generator. The test computes the performance of the KNN classifier as well as the time to create the structure and classify max_samples (5000 by default) instances. """ opt = FileOption('FILE', 'OPT_NAME', '../datasets/sea_big.csv', 'csv', False) stream = FileStream(opt, -1, 1) stream.prepare_for_use() train = 200 X, y = stream.next_instance(train) #t = OneHotToCategorical([[10, 11, 12, 13], # [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, # 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]]) #t2 = OneHotToCategorical([[10, 11, 12, 13], # [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, # 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]]) start = timer() knn = KNN(k=8, max_window_size=2000, leaf_size=40) #pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)]) #compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean') #pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)]) #pipe.fit(X, y) #pipe2.fit(X, y) knn.partial_fit(X, y) #compare.fit(X, y) n_samples = 0 max_samples = 5000 my_corrects = 0 compare_corrects = 0 while n_samples < max_samples: X, y = stream.next_instance() #my_pred = pipe.predict(X) my_pred = knn.predict(X) #compare_pred = pipe2.predict(X) #compare_pred = compare.predict(X) if y[0] == my_pred[0]: my_corrects += 1 #if y[0] == compare_pred[0]: # compare_corrects += 1 n_samples += 1 end = timer() print('Evaluation time: ' + str(end - start)) print(str(n_samples) + ' samples analyzed.') print('My performance: ' + str(my_corrects / n_samples))
def demo_parameterized(h, dset="sea_stream.csv", show_plot=True):
    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/" + dset, "CSV", False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    # For each classifier, e...
    T_init = 100
    eval = EvaluatePrequential(pretrain_size=T_init, output_file='output.csv',
                               max_instances=10000, batch_size=1, n_wait=1000,
                               task_type='classification', show_plot=show_plot,
                               plot_options=['performance'])
    eval.eval(stream=stream, classifier=h)
def test_file_stream(test_path, package_path):
    test_file = os.path.join(package_path, 'src/skmultiflow/datasets/sea_stream.csv')
    file_option = FileOption('FILE', 'sea', test_file, 'csv', False)
    stream = FileStream(file_option)
    stream.prepare_for_use()

    assert stream.estimated_remaining_instances() == 40000

    expected_header = ['attrib1', 'attrib2', 'attrib3']
    assert stream.get_attributes_header() == expected_header

    expected_classes = [0, 1]
    assert stream.get_classes() == expected_classes

    assert stream.get_classes_header() == ['class']
    assert stream.get_num_attributes() == 3
    assert stream.get_num_nominal_attributes() == 0
    assert stream.get_num_numerical_attributes() == 3
    assert stream.get_num_targets() == 1
    assert stream.get_num_values_per_nominal_attribute() == 0
    assert stream.get_plot_name() == 'sea_stream.csv - 2 class labels'
    assert stream.has_more_instances() is True
    assert stream.is_restartable() is True

    # Load test data corresponding to the first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_instance()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.get_last_instance()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_instance(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)
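# For reference, the baseline arrays loaded in the test above could be
# (re)generated with a snippet along these lines (hypothetical helper; the
# actual sea_stream.npz fixture ships with the test suite):
import numpy as np
from skmultiflow.data.file_stream import FileStream
from skmultiflow.options.file_option import FileOption

def make_sea_stream_baseline(csv_path, out_path='sea_stream.npz'):
    stream = FileStream(FileOption('FILE', 'sea', csv_path, 'csv', False))
    stream.prepare_for_use()
    X, y = stream.next_instance(10)  # first 10 instances as the baseline
    np.savez(out_path, X=X, y=y)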
def demo(): """ _test_mol This demo tests the MOL learner on a file stream, which reads from the music.csv file. The test computes the performance of the MOL learner as well as the time to create the structure and classify all the samples in the file. """ # Setup logging logging.basicConfig(format='%(message)s', level=logging.INFO) # Setup the file stream opt = FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False) stream = FileStream(opt, 0, 6) stream.prepare_for_use() # Setup the classifier, by default it uses Logistic Regression #classifier = MultiOutputLearner() #classifier = MultiOutputLearner(h=SGDClassifier(n_iter=100)) classifier = MultiOutputLearner(h=Perceptron()) # Setup the pipeline pipe = Pipeline([('classifier', classifier)]) pretrain_size = 150 logging.info('Pre training on %s samples', str(pretrain_size)) X, y = stream.next_instance(pretrain_size) #classifier.fit(X, y) pipe.partial_fit(X, y, classes=stream.get_classes()) count = 0 true_labels = [] predicts = [] init_time = timer() logging.info('Evaluating...') while stream.has_more_instances(): X, y = stream.next_instance() #p = classifier.predict(X) p = pipe.predict(X) predicts.extend(p) true_labels.extend(y) count += 1 perf = hamming_score(true_labels, predicts) logging.info('Evaluation time: %s s', str(timer() - init_time)) logging.info('Total samples analyzed: %s', str(count)) logging.info('The classifier\'s static Hamming score : %0.3f' % perf)
def demo():
    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    T_init = 100
    eval = EvaluatePrequential(pretrain_size=T_init, output_file='output.csv',
                               max_instances=10000, batch_size=1, n_wait=1000,
                               task_type='classification', show_plot=True,
                               plot_options=['performance'])
    eval.eval(stream=stream, classifier=h)
def demo(): """ _test_streams This demo tests if the streams are correctly generating samples. :return: """ opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv', False) stream = FileStream(opt, -1, 1) stream.prepare_for_use() rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, num_centroids=50, model_seed=32523423, instance_seed=5435, num_classes=2, num_att=10, num_drift_centroids=50) rbf_drift.prepare_for_use() sea = SEAGenerator() print('1 instance:\n') X, y = stream.next_instance() print(X) print(y) X, y = sea.next_instance() print(X) print(y) print('\n\n10 instances:\n') X, y = stream.next_instance(10) print(X) print(y) X, y = sea.next_instance(10) print(X) print(y)
def demo(): """ _test_stream_speed This demo tests the sample generation speed of the file stream. """ # Setup the stream opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False) stream = FileStream(opt, -1, 1) stream = RandomRBFGeneratorDrift() stream.prepare_for_use() # Test with RandomTreeGenerator #opt_list = [['-c', '2'], ['-o', '0'], ['-u', '5'], ['-v', '4']] #stream = RandomTreeGenerator(opt_list) #stream.prepare_for_use() # Setup the evaluator eval = EvaluateStreamGenerationSpeed(100000, float("inf"), None, 5) # Evaluate eval.eval(stream)
def demo(): """ _test_kdtree_compare This demo compares creation and query speed for different kd tree implementations. They are fed with instances from the covtype dataset. Three kd tree implementations are compared: SciPy's KDTree, NumPy's KDTree and scikit-multiflow's KDTree. For each of them the demo will time the construction of the tree on 1000 instances, and then measure the time to query 100 instances. The results are displayed in the terminal. """ warnings.filterwarnings("ignore", ".*Passing 1d.*") opt = FileOption('FILE', 'OPT_NAME', '../datasets/covtype.csv', 'csv', False) stream = FileStream(opt, -1, 1) stream.prepare_for_use() filter = OneHotToCategorical([[10, 11, 12, 13], [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]]) X, y = stream.next_instance(1000) X = filter.transform(X) #print(X) X_find, y = stream.next_instance(100) X_find = filter.transform(X_find) print(X_find[4]) # Normal kdtree start = timer() scipy = spatial.KDTree(X, leafsize=40) end = timer() print("\nScipy KDTree construction time: " + str(end-start)) start = timer() for i in range(10): ind = scipy.query(X_find[i], 8) #print(ind) end = timer() print("Scipy KDTree query time: " + str(end - start)) del scipy # Fast kdtree start = timer() opt = KDTree(X, metric='euclidean', return_distance=True) end = timer() print("\nOptimal KDTree construction time: " + str(end-start)) start = timer() for i in range(100): ind, dist = opt.query(X_find[i], 8) #print(ind) #print(dist) end = timer() print("Optimal KDTree query time: " + str(end - start)) del opt # Sklearn kdtree start = timer() sk = ng.KDTree(X, metric='euclidean') end = timer() print("\nSklearn KDTree construction time: " + str(end-start)) start = timer() for i in range(100): ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True) #print(ind) #print(dist) end = timer() print("Sklearn KDTree query time: " + str(end - start) + "\n") del sk
from skmultiflow.classification.lazy.knn_adwin import KNN
from skmultiflow.classification.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.options.file_option import FileOption

from my_classifier import BatchClassifier

dataset = "elec"

# 1. Create a stream
opt = FileOption("FILE", "OPT_NAME", "./data/" + dataset + ".csv", "CSV", False)
stream = FileStream(opt, -1, 1)

# 2. Prepare for use
stream.prepare_for_use()

# 3. Instantiate the classifiers to compare
h = [
    KNN(k=10, max_window_size=100, leaf_size=30),
    HoeffdingTree(),
    BatchClassifier(window_size=100, max_models=10),
]

# 4. Setup the evaluator
eval = EvaluatePrequential(pretrain_size=1000,
                           output_file='result_' + dataset + '.csv',
                           max_instances=10000,
                           batch_size=1,
                           n_wait=500,
                           max_time=1000000000,
                           task_type='classification',
                           show_plot=True,
                           plot_options=['performance'])

# 5. Run
eval.eval(stream=stream, classifier=h)
def demo(): """ _test_knn_adwin This demo tests the KNNAdwin classifier on a file stream, which gives instances coming from a SEA generator. The test computes the performance of the KNNAdwin classifier as well as the time to create the structure and classify max_samples (10000 by default) instances. """ start = timer() logging.basicConfig(format='%(message)s', level=logging.INFO) #warnings.filterwarnings("ignore", ".*Passing 1d.*") opt = FileOption('FILE', 'OPT_NAME', '../datasets/sea_big.csv', 'csv', False) stream = FileStream(opt, -1, 1) #stream = RandomRBFGeneratorDrift(change_speed=41.00, num_centroids=50, model_seed=32523423, instance_seed=5435, # num_classes=2, num_att=10, num_drift_centroids=50) stream.prepare_for_use() t = OneHotToCategorical([[10, 11, 12, 13], [ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ]]) t2 = OneHotToCategorical([[10, 11, 12, 13], [ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ]]) #knn = KNN(k=8, max_window_size=2000, leaf_size=40) knn = KNNAdwin(k=8, leaf_size=40, max_window_size=2000) #pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)]) compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean') #pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)]) first = True train = 200 if train > 0: X, y = stream.next_instance(train) #pipe.partial_fit(X, y, classes=stream.get_classes()) #pipe.partial_fit(X, y, classes=stream.get_classes()) #pipe2.fit(X, y) knn.partial_fit(X, y, classes=stream.get_classes()) compare.fit(X, y) first = False n_samples = 0 max_samples = 10000 my_corrects = 0 compare_corrects = 0 while n_samples < max_samples: if n_samples % (max_samples / 20) == 0: logging.info('%s%%', str((n_samples // (max_samples / 20) * 5))) X, y = stream.next_instance() #my_pred = pipe.predict(X) my_pred = knn.predict(X) #my_pred = [1] if first: #pipe.partial_fit(X, y, classes=stream.get_classes()) #pipe.partial_fit(X, y, classes=stream.get_classes()) knn.partial_fit(X, y, classes=stream.get_classes()) first = False else: #pipe.partial_fit(X, y) knn.partial_fit(X, y) #compare_pred = pipe2.predict(X) compare_pred = compare.predict(X) if y[0] == my_pred[0]: my_corrects += 1 if y[0] == compare_pred[0]: compare_corrects += 1 n_samples += 1 end = timer() print('Evaluation time: ' + str(end - start)) print(str(n_samples) + ' samples analyzed.') print('My performance: ' + str(my_corrects / n_samples)) print('Compare performance: ' + str(compare_corrects / n_samples))
eval.eval(stream=stream, classifier=adf)


# # Eval Prequential with datasets.csv for ARF

# In[47]:

from skmultiflow.options.file_option import FileOption
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential

# 1. Create a stream
# options = FileOption(option_value="../datasets/covtype.csv", file_extension="CSV")
# options = FileOption(option_value="../datasets/movingSquares.csv", file_extension="CSV")
options = FileOption(option_value="../datasets/sea_stream.csv", file_extension="CSV")
stream = FileStream(options)
stream.prepare_for_use()

# 2. Instantiate the classifier
adf = AdaptiveRandomForest()

# 3. Setup the evaluator
eval = EvaluatePrequential(pretrain_size=1000,
                           max_instances=100000,
                           batch_size=1,
                           max_time=1000,
                           output_file='resultsPrequential.csv',
                           task_type='classification',
                           show_plot=True,
                           plot_options=['kappa', 'performance'])

# 4. Run evaluation
eval.eval(stream=stream, classifier=adf)
def demo(instances=2000):
    """ _test_comparison_prequential

    This demo will test a prequential evaluation when more than one learner is
    passed, which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    # stream = SEAGenerator(classification_function=2, instance_seed=53432, balance_classes=False)
    stream.prepare_for_use()

    # Setup the classifiers
    clf = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000, leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    clf_one = KNNAdwin(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(h=KNN(), ensemble_length=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                                  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                                  50, 51, 52, 53]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                              [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    #                               32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    #                               50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(pretrain_size=2000, output_file='teste.csv', max_instances=instances,
                               batch_size=1, n_wait=200, max_time=1000, task_type='classification',
                               show_plot=True, plot_options=['performance', 'kappa_t'])

    # Evaluate
    eval.eval(stream=stream, classifier=classifier)
# # Eval Holdout with datasets.csv for ARF

# In[ ]:

from skmultiflow.options.file_option import FileOption
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_holdout import EvaluateHoldout

# 1. Create a stream
# options = FileOption(option_value="../datasets/covtype.csv", file_extension="CSV")
options = FileOption(option_value="../datasets/movingSquares.csv", file_extension="CSV")
# options = FileOption(option_value="../datasets/sea_stream.csv", file_extension="CSV")
stream = FileStream(options)
stream.prepare_for_use()

# 2. Instantiate the classifier
adf = AdaptiveRandomForest()

# 3. Setup the evaluator
eval = EvaluateHoldout(pretrain_size=200,
                       max_instances=10000,
                       batch_size=1,
                       max_time=1000,
                       output_file='resultsHoldout.csv',
                       task_type='classification',
                       show_plot=True,
                       plot_options=['kappa', 'performance'],
                       test_size=5000,
                       dynamic_test_set=True)

# 4. Run evaluation
eval.eval(stream=stream, classifier=adf)
from skmultiflow.classification.trees.hoeffding_adaptive_tree import HoeffdingAdaptiveTree
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.options.file_option import FileOption

dataset = "covtype"

# 1. Create a stream
opt = FileOption("FILE", "OPT_NAME", "skmultiflow/datasets/" + dataset + ".csv", "CSV", False)
stream = FileStream(opt, -1, 1)

# 2. Prepare for use
stream.prepare_for_use()

# 3. Instantiate the HoeffdingAdaptiveTree classifier
h = HoeffdingAdaptiveTree()

# 4. Setup the evaluator
eval = EvaluatePrequential(pretrain_size=1000,
                           output_file='result_' + dataset + '.csv',
                           max_instances=10000,
                           batch_size=1,
                           n_wait=500,
                           max_time=1000000000,
                           task_type='classification',
                           show_plot=True,
                           plot_options=['performance'])

# 5. Run
eval.eval(stream=stream, classifier=h)