def test_evaluate_classification_coverage(tmpdir):
    """Smoke-test EvaluatePrequential on a binary classification stream.

    Metric correctness is covered in the dedicated metrics test module;
    this only checks that evaluation runs end-to-end and that the final
    window accuracy matches the stored reference value.
    """
    data_stream = RandomTreeGenerator(tree_random_state=23,
                                      sample_random_state=12,
                                      n_classes=2,
                                      n_cat_features=2,
                                      n_num_features=5,
                                      n_categories_per_cat_feature=5,
                                      max_tree_depth=6,
                                      min_leaf_depth=3,
                                      fraction_leaves_per_level=0.15)

    # Features from index 15 onwards are the categorical ones.
    nominal_indices = list(range(15, len(data_stream.feature_names)))
    model = HoeffdingTreeClassifier(nominal_attributes=nominal_indices)

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['accuracy', 'kappa', 'kappa_t',
                                             'kappa_m', 'f1', 'precision',
                                             'recall', 'gmean',
                                             'true_vs_predicted'],
                                    output_file=summary_path)

    evaluator.evaluate(stream=data_stream, model=model)

    _, current_performance = evaluator.get_measurements(model_idx=0)
    assert np.isclose(current_performance.accuracy_score(), 0.685)
def demo(output_file=None, instances=40000):
    """Demonstrate prequential evaluation of a regressor.

    A RegressionGenerator stream (40000 samples) feeds a Hoeffding tree
    regressor while mean square error is tracked.

    Parameters
    ----------
    output_file: string
        The name of the csv output file.
    instances: int
        The evaluation's max number of instances.
    """
    data_stream = RegressionGenerator(n_samples=40000)
    model = HoeffdingTreeRegressor()

    # Pretrain on a single sample, then interleave test-then-train.
    evaluator = EvaluatePrequential(pretrain_size=1,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=False,
                                    metrics=['mean_square_error'])
    evaluator.evaluate(stream=data_stream, model=model)
def test_evaluate_multi_target_regression_coverage(tmpdir):
    """Smoke-test prequential evaluation on a multi-target regression stream.

    Metric correctness is covered in the dedicated metrics test module.
    """
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import iSOUPTreeRegressor

    n_samples = 1000
    data_stream = RegressionGenerator(n_samples=n_samples, n_features=20,
                                      n_informative=15, random_state=1,
                                      n_targets=7)
    model = iSOUPTreeRegressor(leaf_prediction='adaptive')

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=n_samples,
                                    metrics=['average_mean_square_error',
                                             'average_mean_absolute_error',
                                             'average_root_mean_square_error'],
                                    output_file=summary_path)
    evaluator.evaluate(stream=data_stream, model=model, model_names=['MTRHT'])
def demo():
    """Demonstrate a Pipeline acting as a learner inside EvaluatePrequential.

    A WaveformGenerator stream feeds a one-stage pipeline wrapping a
    Hoeffding tree classifier; evaluation runs with live plotting.
    """
    data_stream = WaveformGenerator()

    model = HoeffdingTreeClassifier()

    # The pipeline itself is what gets passed to the evaluator as the model.
    pipeline = Pipeline([('Hoeffding Tree', model)])

    evaluator = EvaluatePrequential(show_plot=True,
                                    pretrain_size=1000,
                                    max_samples=100000)
    evaluator.evaluate(stream=data_stream, model=pipeline)
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequential evaluation of a Hoeffding tree against a stored CSV summary."""
    data_stream = RandomTreeGenerator(tree_random_state=23,
                                      sample_random_state=12,
                                      n_classes=4,
                                      n_cat_features=2,
                                      n_num_features=5,
                                      n_categories_per_cat_feature=5,
                                      max_tree_depth=6,
                                      min_leaf_depth=3,
                                      fraction_leaves_per_level=0.15)
    data_stream.prepare_for_use()

    # Features from index 15 onwards are the categorical ones.
    nominal_indices = list(range(15, len(data_stream.feature_names)))
    model = HoeffdingTree(nominal_attributes=nominal_indices)

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['kappa', 'kappa_t', 'performance'],
                                    output_file=summary_path)

    trained = evaluator.evaluate(stream=data_stream, model=model)
    trained_model = trained[0]

    assert isinstance(trained_model, HoeffdingTree)
    # evaluate() trains the passed model in place, so both handles must agree.
    assert model.get_model_measurements == trained_model.get_model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(summary_path, expected_file)
def demo(instances=2000):
    """Compare two classifiers under a single prequential evaluation.

    An SGD classifier and a KNN-ADWIN pipeline (with one-hot-to-categorical
    preprocessing) are evaluated side by side on the covertype file stream.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
    """
    data_stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    data_stream.prepare_for_use()

    sgd = SGDClassifier()
    knn = KNNAdwin(n_neighbors=8, max_window_size=1000, leaf_size=30)

    # Columns 10-13 and 14-53 are one-hot encoded attribute groups; fold
    # them back into two categorical features before they reach the KNN.
    one_hot = OneHotToCategorical([[10, 11, 12, 13], list(range(14, 54))])
    knn_pipeline = Pipeline([('one_hot_to_categorical', one_hot),
                             ('KNN', knn)])

    models = [sgd, knn_pipeline]
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='test_comparison_prequential.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True,
                                    metrics=['performance', 'kappa_t'])
    evaluator.evaluate(stream=data_stream, model=models)
def demo(output_file=None, instances=50000):
    """Run a prequential evaluation of SAMKNN on the movingSquares stream.

    The stream is read from the local movingSquares.csv file; the SAMKNN
    classifier is evaluated with live plotting, no pretraining.

    Parameters
    ----------
    output_file: string
        The name of the csv output file.
    instances: int
        The evaluation's max number of instances.
    """
    data_stream = FileStream("../data/datasets/movingSquares.csv", -1, 1)
    data_stream.prepare_for_use()

    model = SAMKNN(n_neighbors=5,
                   weighting='distance',
                   max_window_size=1000,
                   stm_size_option='maxACCApprox',
                   use_ltm=False)

    evaluator = EvaluatePrequential(pretrain_size=0,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=100,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=True,
                                    metrics=['performance'])
    evaluator.evaluate(stream=data_stream, model=model)
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Prequential evaluation of a Hoeffding tree classifier.

    Checks the returned model, the CSV summary file, the mean/current
    metric values, and the evaluator's get_info() text against stored
    reference values.
    """
    # Setup file stream
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner -- features from index 15 onwards are categorical
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    max_samples = 1000
    metrics = ['accuracy', 'kappa', 'kappa_t']
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]

    assert isinstance(result_learner, HoeffdingTree)

    # evaluate() trains the passed model in place, so both handles agree
    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Reference values produced by this exact stream/learner configuration
    expected_mean_accuracy = 0.436250
    assert np.isclose(mean_performance.get_accuracy(), expected_mean_accuracy)

    expected_mean_kappa = 0.231791
    assert np.isclose(mean_performance.get_kappa(), expected_mean_kappa)

    expected_mean_kappa_t = 0.236887
    assert np.isclose(mean_performance.get_kappa_t(), expected_mean_kappa_t)

    expected_current_accuracy = 0.430000
    assert np.isclose(current_performance.get_accuracy(), expected_current_accuracy)

    expected_current_kappa = 0.223909
    assert np.isclose(current_performance.get_kappa(), expected_current_kappa)

    expected_current_kappa_t = 0.240000
    assert np.isclose(current_performance.get_kappa_t(), expected_current_kappa_t)

    # NOTE(review): continuation-line indentation inside this expected repr
    # was mangled in transit; assumed aligned under "EvaluatePrequential(" --
    # confirm against the stored reference before relying on it.
    expected_info = "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n" \
                    "                    max_samples=1000, max_time=inf,\n" \
                    "                    metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n" \
                    "                    output_file='prequential_summary.csv',\n" \
                    "                    pretrain_size=200, restart_stream=True, show_plot=False)"
    assert evaluator.get_info() == expected_info
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    """Prequentially evaluate the given model(s) on a remote file stream.

    Parameters
    ----------
    h: learner or list of learners
        Model(s) to evaluate.
    filename: string
        Dataset file name within the streaming-datasets repository.
    show_plot: bool
        Whether to display the evaluation plot.
    model_names: list of strings, optional
        Display names for the models.
    """
    base_url = ("https://raw.githubusercontent.com/scikit-multiflow/"
                "streaming-datasets/master/")
    data_stream = FileStream(base_url + filename)

    evaluator = EvaluatePrequential(pretrain_size=100,
                                    output_file='test_parametrized.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=500,
                                    show_plot=show_plot)
    evaluator.evaluate(stream=data_stream, model=h, model_names=model_names)
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    """Prequentially evaluate the given model(s) on a local file stream.

    Parameters
    ----------
    h: learner or list of learners
        Model(s) to evaluate.
    filename: string
        Dataset file name inside ../data/datasets/.
    show_plot: bool
        Whether to display the evaluation plot.
    model_names: list of strings, optional
        Display names for the models.
    """
    data_stream = FileStream("../data/datasets/" + filename)
    data_stream.prepare_for_use()

    evaluator = EvaluatePrequential(pretrain_size=100,
                                    output_file='test_parametrized.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=500,
                                    show_plot=show_plot)
    evaluator.evaluate(stream=data_stream, model=h, model_names=model_names)
def demo():
    """Prequentially evaluate a Hoeffding tree on the local SEA file stream."""
    model = HoeffdingTreeClassifier()

    data_stream = FileStream("../data/datasets/sea_stream.csv")

    evaluator = EvaluatePrequential(pretrain_size=100,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=data_stream, model=model)
def evaluate(params, stream, study_size, metrics=None):
    """Evaluate one ARSLVQ hyper-parameter configuration on a stream.

    Parameters
    ----------
    params: sequence
        Hyper-parameter vector: params[1] is gamma, params[2] sigma,
        params[3] prototypes per class, params[4] confidence; params[0]
        is only echoed back in the returned row.
    stream: Stream
        The data stream to evaluate on.
    study_size: int
        Maximum number of samples to process.
    metrics: list of str, optional
        Metrics for the evaluator. Defaults to ['accuracy', 'kappa'].
        Fix: the original used a mutable list as the default argument,
        which is created once and shared across calls; a None sentinel
        with a per-call list avoids cross-call mutation.

    Returns
    -------
    list
        The parameter vector followed by the mean accuracy measurements
        taken from the evaluator's data buffer.
    """
    if metrics is None:
        metrics = ['accuracy', 'kappa']

    clf = ARSLVQ(gamma=params[1], sigma=params[2],
                 prototypes_per_class=int(params[3]),
                 confidence=params[4])
    stream.prepare_for_use()

    evaluator = EvaluatePrequential(show_plot=False, batch_size=10,
                                    max_samples=study_size, metrics=metrics)
    evaluator.evaluate(stream=stream, model=clf)

    print(evaluator.get_mean_measurements())

    # _data_buffer is a private attribute of the evaluator -- kept because
    # the mean accuracy series is not exposed through a public accessor.
    return list(params) + evaluator._data_buffer.get_data(metric_id="accuracy",
                                                          data_id="mean")
def demo(output_file=None, instances=50000):
    """Prequential evaluation of SAMKNNClassifier on the moving_squares stream.

    The stream is fetched from the streaming-datasets repository; the
    classifier is evaluated with live plotting and no pretraining.

    Parameters
    ----------
    output_file: string
        The name of the csv output file.
    instances: int
        The evaluation's max number of instances.
    """
    data_stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/moving_squares.csv")

    model = SAMKNNClassifier(n_neighbors=5,
                             weighting='distance',
                             max_window_size=1000,
                             stm_size_option='maxACCApprox',
                             use_ltm=False)

    evaluator = EvaluatePrequential(pretrain_size=0,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=100,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=True)
    evaluator.evaluate(stream=data_stream, model=model)
def test_pipeline(test_path): n_categories = 5 # Load test data generated using: # RandomTreeGenerator(tree_random_state=1, sample_random_state=1, # n_cat_features=n_categories, n_num_features=0) test_file = os.path.join(test_path, 'data-one-hot.npz') data = np.load(test_file) X = data['X'] y = data['y'] stream = DataStream(data=X, y=y) stream.prepare_for_use() # Setup transformer cat_att_idx = [[i + j for i in range(n_categories)] for j in range(0, n_categories * n_categories, n_categories) ] transformer = OneHotToCategorical(categorical_list=cat_att_idx) # Set up the classifier classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40) # Setup the pipeline pipe = Pipeline([('one-hot', transformer), ('KNNAdwin', classifier)]) # Setup the evaluator evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100) # Evaluate evaluator.evaluate(stream=stream, model=pipe) metrics = evaluator.get_mean_measurements() expected_accuracy = 0.5555555555555556 assert np.isclose(expected_accuracy, metrics[0].get_accuracy()) expected_kappa = 0.11111111111111116 assert np.isclose(expected_kappa, metrics[0].get_kappa()) print(pipe.get_info()) expected_info = "Pipeline:\n" \ "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n" \ " [10, 11, 12, 13, 14],\n" \ " [15, 16, 17, 18, 19],\n" \ " [20, 21, 22, 23, 24]])\n" \ "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n" \ " nominal_attributes=None)]" assert pipe.get_info() == expected_info
def test_pipeline(test_path):
    """Pipeline (one-hot transformer + KNN-ADWIN) under prequential evaluation.

    Verifies accuracy/kappa against stored values and the pipeline's
    get_info() text, compared whitespace-insensitively.
    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    # Fix: the `np.int` alias was deprecated in NumPy 1.20 and removed in
    # NumPy 1.24, so `y.astype(np.int)` now raises AttributeError; the
    # builtin `int` is the documented, behavior-identical replacement.
    stream = DataStream(data=X, y=y.astype(int))

    # Setup transformer: each consecutive group of n_categories columns is
    # one one-hot encoded categorical attribute
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    # Normalize whitespace so the comparison is layout-independent
    info = " ".join([line.strip() for line in pipe.get_info().split()])
    assert info == expected_info
def demo():
    """Prequentially evaluate a Hoeffding tree on the remote SEA stream."""
    model = HoeffdingTreeClassifier()

    data_stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/sea_stream.csv")

    evaluator = EvaluatePrequential(pretrain_size=100,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=data_stream, model=model)
def test_evaluate_classification_metrics():
    """Check f1/precision/recall/g-mean values reported by EvaluatePrequential."""
    data_stream = RandomTreeGenerator(tree_random_state=23,
                                      sample_random_state=12,
                                      n_classes=2,
                                      n_cat_features=2,
                                      n_num_features=5,
                                      n_categories_per_cat_feature=5,
                                      max_tree_depth=6,
                                      min_leaf_depth=3,
                                      fraction_leaves_per_level=0.15)
    data_stream.prepare_for_use()

    # Features from index 15 onwards are the categorical ones.
    nominal_indices = list(range(15, len(data_stream.feature_names)))
    model = HoeffdingTree(nominal_attributes=nominal_indices)

    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['f1', 'precision',
                                             'recall', 'gmean'])
    evaluator.evaluate(stream=data_stream, model=model)

    mean_perf, current_perf = evaluator.get_measurements(model_idx=0)

    # Debug output kept for easier diagnosis when an assertion fails.
    print(mean_perf.get_g_mean())
    print(mean_perf.get_recall())
    print(mean_perf.get_precision())
    print(mean_perf.get_f1_score())
    print(current_perf.get_g_mean())
    print(current_perf.get_recall())
    print(current_perf.get_precision())
    print(current_perf.get_f1_score())

    # Reference values produced by this exact stream/learner configuration.
    assert np.isclose(current_perf.get_f1_score(), 0.7096774193548387)
    assert np.isclose(current_perf.get_precision(), 0.6814159292035398)
    assert np.isclose(current_perf.get_recall(), 0.7403846153846154)
    assert np.isclose(current_perf.get_g_mean(), 0.6802502367624613)
    assert np.isclose(mean_perf.get_f1_score(), 0.7009803921568628)
    assert np.isclose(mean_perf.get_precision(), 0.7185929648241206)
    assert np.isclose(mean_perf.get_recall(), 0.6842105263157895)
    assert np.isclose(mean_perf.get_g_mean(), 0.6954166367760247)
def demo(output_file=None, instances=40000):
    """Prequential evaluation of a LeverageBaggingClassifier.

    The ensemble wraps two Hoeffding trees and is evaluated on a
    WaveformGenerator stream.

    Parameters
    ----------
    output_file: string
        The name of the csv output file.
    instances: int
        The evaluation's max number of instances.
    """
    data_stream = WaveformGenerator()
    data_stream.prepare_for_use()

    ensemble = LeverageBaggingClassifier(
        base_estimator=HoeffdingTreeClassifier(), n_estimators=2)

    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    max_samples=instances,
                                    output_file=output_file,
                                    show_plot=False)
    evaluator.evaluate(stream=data_stream, model=ensemble)
def test_evaluate_coverage(tmpdir):
    """Smoke-test running_time/model_size metrics with data-point plotting."""
    from skmultiflow.data import SEAGenerator
    from skmultiflow.bayes import NaiveBayes

    data_stream = SEAGenerator(random_state=1)
    model = NaiveBayes()

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['running_time', 'model_size'],
                                    data_points_for_classification=True,
                                    output_file=summary_path)
    evaluator.evaluate(stream=data_stream, model=model, model_names=['NB'])
def test_evaluate_multi_target_classification_coverage(tmpdir):
    """Smoke-test multi-label metrics; metric correctness is tested elsewhere."""
    from skmultiflow.data import MultilabelGenerator
    from skmultiflow.meta import MultiOutputLearner

    n_samples = 1000
    data_stream = MultilabelGenerator(n_samples=n_samples, random_state=1)
    model = MultiOutputLearner()

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=n_samples,
                                    metrics=['hamming_score', 'hamming_loss',
                                             'exact_match', 'j_index'],
                                    output_file=summary_path)
    evaluator.evaluate(stream=data_stream, model=[model], model_names=['MOL'])
def test_evaluate_regression_coverage(tmpdir):
    """Smoke-test regression metrics; metric correctness is tested elsewhere."""
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import HoeffdingTreeRegressor

    n_samples = 1000
    data_stream = RegressionGenerator(n_samples=n_samples)
    model = HoeffdingTreeRegressor()

    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=n_samples,
                                    metrics=['mean_square_error',
                                             'mean_absolute_error'],
                                    output_file=summary_path)
    evaluator.evaluate(stream=data_stream, model=model, model_names=['HTR'])
def train(name, clusters, window, normalize=False):
    """Evaluate a Hoeffding tree and an SGD classifier on a prepared stream.

    Loads the prepared CSV identified by (name, clusters, window),
    optionally standardizes the sensor columns, runs a prequential
    evaluation of both models, and returns their mean metrics.

    Parameters
    ----------
    name: str
        Dataset name used to locate the prepared CSV.
    clusters: int
        Number of clusters used during preparation.
    window: int
        Window size used during preparation.
    normalize: bool
        If True, standardize the sensor columns before streaming.

    Returns
    -------
    pandas.DataFrame
        One row per model with accuracy, precision, recall and f1.
    """
    input_csv = '{}{}_clusters={}_window={}_prepared.csv'.format(
        DATA_LOCATION, name, clusters, window)
    frame = pd.read_csv(input_csv, index_col=0)

    if normalize:
        # Scale only the sensor columns; the two state columns are
        # re-attached untouched afterwards.
        states = frame.filter(['current_state', 'next_state'])
        sensors = frame.drop(columns=['current_state', 'next_state'])
        scaler = StandardScaler()
        frame = pd.DataFrame(data=scaler.fit_transform(X=sensors),
                             index=frame.index,
                             columns=sensors.columns)
        frame = pd.concat([frame, states], axis='columns')

    stream = DataStream(frame)
    models = [HoeffdingTreeClassifier(), SGDClassifier()]

    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=models)

    rows = []
    for i, measurements in enumerate(evaluator.get_mean_measurements()):
        rows.append([name, clusters, window, MODEL_NAMES[i], normalize,
                     measurements.accuracy_score(),
                     measurements.precision_score(),
                     measurements.recall_score(),
                     measurements.f1_score()])

    return pd.DataFrame(data=rows,
                        columns=['name', 'clusters', 'window', 'model',
                                 'normalized', 'accuracy', 'precision',
                                 'recall', 'f1'])
def test_data_stream(test_path):
    """Run several learners over a CSV-backed DataStream (smoke test only)."""
    test_file = os.path.join(test_path, 'data/data_n30000.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    plain_knn = KNNClassifier(n_neighbors=8, max_window_size=2000,
                              leaf_size=40)
    weighted_knn = WeightedKNNClassifier(n_neighbors=8, max_window_size=2000,
                                         leaf_size=40)
    standardized_knn = KNNClassifier(n_neighbors=8, max_window_size=2000,
                                     leaf_size=40, standardize=True)

    # Features from index 15 onwards are the categorical ones.
    nominal_indices = list(range(15, len(stream.feature_names)))
    hoeffding = HoeffdingTreeClassifier(nominal_attributes=nominal_indices)
    bayes = NaiveBayes()

    output_file = os.path.join(test_path, 'data/kkn_output.csv')
    evaluator = EvaluatePrequential(metrics=['accuracy', 'kappa_m',
                                             'kappa_t', 'recall'],
                                    output_file=output_file)

    evaluator.evaluate(stream=stream,
                       model=[plain_knn, weighted_knn, standardized_knn,
                              hoeffding, bayes])
    mean_performance, current_performance = evaluator.get_measurements()

    # Completing the evaluation without raising is the actual check.
    assert 1 == 1
# Build a synthetic two-class dataset, write it to CSV, then compare a
# DPDES ensemble against OzaBagging under prequential evaluation.
from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.bayes import NaiveBayes
from skmultiflow.meta import OzaBagging
from sklearn.datasets import make_classification

# NOTE(review): DPDESMethod and KNORAU are used below but never imported in
# this snippet -- presumably supplied by a project-local module; verify.
with open("dataset_imb.csv", "w") as f:
    # weights=[0.5] yields a balanced class split despite the "imb" name
    X, y = make_classification(
        n_features=10, n_informative=10, n_redundant=0, n_samples=10000,
        weights=[0.5]
    )
    # Write each sample as comma-separated attributes followed by the label
    for i in range(X.shape[0]):
        for att in X[i]:
            f.write(str(att) + ",")
        f.write(str(y[i]) + "\n")

generator = FileStream("dataset_imb.csv")
dpdes = DPDESMethod(NaiveBayes(), 200, 10, KNORAU())
ozabag = OzaBagging(NaiveBayes(), n_estimators=10)
# Batches of 200 samples, no pretraining, precision tracked every 200 samples
evaluator = EvaluatePrequential(
    max_samples=10000,
    n_wait=200,
    batch_size=200,
    pretrain_size=0,
    metrics=["precision"],
)
evaluator.evaluate(generator, [dpdes, ozabag], ["DPDES", "Ozabag"])
ds = args.dataset ds = ds.replace("final_800_", "") ds = ds.replace(".pickle", "") #ds = ds.replace("_", " ") nama_model = nama_model+" ("+ds+")" stream_wave = MFCCStream('dataset/'+test_dataset,nama_model=nama_model,additional_data= testDataset) classifier = CMGMMClassifier( classes=stream_wave.get_target_values(),prune_component=prune_comp,drift_detector=detector) classifier.train(train_dataset,'label','mfcc') eval = EvaluatePrequential(show_plot=True, pretrain_size=0, batch_size=1, metrics=['accuracy', 'f1','running_time'], output_file=result_dir+file_name, #data_points_for_classification=True ) eval.evaluate(stream=stream_wave, model=classifier, model_names=[model_name]) print(eval._data_buffer.get_data(metric_id=constants.ACCURACY, data_id=constants.MEAN)[0]) print((eval.model[0].adaptasi)) ''' stream = FileStream('dataset/poker.csv') classifier = SGDClassifier() eval = EvaluatePrequential(show_plot=True, pretrain_size=500, batch_size=200, metrics=['accuracy', 'kappa', 'running_time', 'model_size'])
f1s = [] tprs = [] aucs = [] mean_fpr = np.linspace(0, 1, 100) for fold, split in enumerate(cross_validation.split(X_train, y_train)): fold_train_indexes, fold_test_indexes = split fold_X_train = X_train.iloc[fold_train_indexes] fold_y_train = y_train.iloc[fold_train_indexes] fold_X_test = X_train.iloc[fold_test_indexes] fold_y_test = y_train.iloc[fold_test_indexes] if (classifier_name == 'hoeffding'): stream = DataStream(X, y.values.ravel()) stream.prepare_for_use() evaluator = EvaluatePrequential( show_plot=False, pretrain_size=200, metrics=['accuracy']) model = evaluator.evaluate( stream=stream, model=classifier)[0] model.fit(fold_X_train, fold_y_train.values.ravel()) # elif (classifier_name == 'cn2'): # model = CrossValidation( # table_from_frame(data), [CN2Learner()], k=5) else: model = classifier.fit( fold_X_train, fold_y_train.values.ravel()) y_pred = model.predict(fold_X_test) accuracies.append(accuracy_score(fold_y_test, y_pred)) precisions.append(precision_score(
# Prequential evaluation of a sliding-window KNN on a local file stream.
from skmultiflow.data import FileStream
from skmultiflow.lazy.knn import KNN
from skmultiflow.evaluation import EvaluatePrequential

# Model and evaluation configuration
n_neighbors = 8
max_window_size = 2000
leaf_size = 30
n_estimators = 30
show_plot = True
pretrain_size = 100
max_samples = 7000
metrics = ['accuracy']

stream = FileStream('data/stream1.csv')
stream.prepare_for_use()

knn_model = KNN(n_neighbors=n_neighbors,
                max_window_size=max_window_size,
                leaf_size=leaf_size)

evaluator = EvaluatePrequential(show_plot=show_plot,
                                pretrain_size=pretrain_size,
                                max_samples=max_samples,
                                metrics=metrics)
evaluator.evaluate(stream=stream, model=knn_model)
def flow_detection_classifier(classifier, stream):
    """Prequentially evaluate `classifier` on `stream` with live plotting.

    Pretrains on 2000 samples and processes at most 50000; the evaluator
    is returned so callers can query its measurements afterwards.
    """
    runner = EvaluatePrequential(show_plot=True,
                                 pretrain_size=2000,
                                 max_samples=50000)
    runner.evaluate(stream=stream, model=classifier)
    return runner
total_length = int(total_length) for data in response.iter_content(chunk_size=4096): dl += len(data) f.write(data) done = int(50 * dl / total_length) sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) sys.stdout.flush() data = np.load(file_name, allow_pickle=True) return data # data = download_data() #If dataset file is already downloaded data = np.load(file_name, allow_pickle=True) sam = SAMKNN() arf = HoeffdingAdaptiveTreeClassifier() stream = DataStream(data[:, 1:], data[:, 0].astype(int)) stream.prepare_for_use() evaluator = EvaluatePrequential(max_samples=10000, max_time=1000, show_plot=True, metrics=['accuracy', 'kappa']) evaluator.evaluate(stream=stream, model=[sam, arf], model_names=['Sam', 'RSLVQ'])
# Prequential evaluation of a Hoeffding tree on a synthetic file stream,
# followed by set-up for a Hoeffding Adaptive tree on a concept-drift stream.
from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.data.file_stream import FileStream
import pandas as pd
import numpy as np

# Load the synthetic data stream
dstream = FileStream('data_stream.csv')
dstream.prepare_for_use()

# Create the model instance
ht_class = HoeffdingTreeClassifier()

# perform prequential evaluation
evaluate1 = EvaluatePrequential(show_plot=False,
                                pretrain_size=400,
                                max_samples=10000,
                                metrics=['accuracy'])
evaluate1.evaluate(stream=dstream, model=ht_class)

###################################################
# Hoeffding Adaptive tree
from skmultiflow.trees import HoeffdingAdaptiveTreeClassifier
from skmultiflow.data import ConceptDriftStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.evaluation import EvaluateHoldout

# Simulate a sample data stream
# NOTE(review): position=30000 places the drift center; evaluation of the
# adaptive tree presumably continues past this excerpt -- verify.
ds = ConceptDriftStream(random_state=777, position=30000)
# Bare expression: displays the stream's repr in an interactive session only
ds

# Output: