def demo(instances=2000):
    """ _test_comparison_prequential

    Runs a prequential evaluation in which more than one learner is handed
    to the evaluator, which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup: local copy of the covtype dataset
    covtype_stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    covtype_stream.prepare_for_use()

    # First learner: SGDClassifier, used without any transformer
    sgd = SGDClassifier()

    # Second learner: KNNAdwin placed behind a OneHotToCategorical transform
    # configured with two index groups, 10..13 and 14..53.
    index_groups = [list(range(10, 14)), list(range(14, 54))]
    one_hot_transform = OneHotToCategorical(index_groups)
    knn_adwin = KNNAdwin(n_neighbors=8, max_window_size=1000, leaf_size=30)
    knn_pipeline = Pipeline([
        ('one_hot_to_categorical', one_hot_transform),
        ('KNN', knn_adwin),
    ])

    # Both learners are evaluated side by side
    learners = [sgd, knn_pipeline]

    # Setup the evaluator
    evaluator_settings = dict(
        pretrain_size=2000,
        output_file='test_comparison_prequential.csv',
        max_samples=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        show_plot=True,
        metrics=['performance', 'kappa_t'],
    )
    evaluator = EvaluatePrequential(**evaluator_settings)

    # Evaluate
    evaluator.evaluate(stream=covtype_stream, model=learners)
# Example no. 2
# 0
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical + KNNADWINClassifier pipeline.

    The pipeline is run through EvaluatePrequential on the stored one-hot
    test data and the resulting accuracy, kappa and ``get_info()`` text are
    compared against fixed expected values.

    Parameters
    ----------
    test_path: str
        Directory that contains the ``data-one-hot.npz`` test file.

    """
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin selects the same dtype on every NumPy version.
    stream = DataStream(data=X, y=y.astype(int))

    # Setup transformer: n_categories consecutive index groups, each of
    # n_categories columns ([0..4], [5..9], ..., [20..24]).
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2,
                                    max_window_size=50,
                                    leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer),
                     ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())

    raw_info = pipe.get_info()
    print(raw_info)
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    # Collapse every run of whitespace (including line breaks) into a single
    # space so the comparison does not depend on how the text is wrapped.
    # ``split()`` already yields whitespace-free tokens, so no strip is needed.
    info = " ".join(raw_info.split())
    assert info == expected_info
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical + KNNAdwin pipeline.

    The pipeline is run through EvaluatePrequential on the stored one-hot
    test data; mean accuracy, mean kappa and the exact ``get_info()`` text
    are compared against fixed expected values.

    Parameters
    ----------
    test_path: str
        Directory that contains the ``data-one-hot.npz`` test file.

    """
    n_categories = 5

    # Test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    archive = np.load(os.path.join(test_path, 'data-one-hot.npz'))
    features = archive['X']
    targets = archive['y']
    stream = DataStream(data=features, y=targets)
    stream.prepare_for_use()

    # Transformer: one index group per block of n_categories columns,
    # i.e. [0..4], [5..9], ..., [20..24]
    group_starts = range(0, n_categories * n_categories, n_categories)
    cat_att_idx = [list(range(first, first + n_categories))
                   for first in group_starts]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Classifier and pipeline
    classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40)
    steps = [('one-hot', transformer), ('KNNAdwin', classifier)]
    pipe = Pipeline(steps)

    # Evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    evaluator.evaluate(stream=stream, model=pipe)

    # Only the first (and only) model's mean measurements are checked
    mean_measurements = evaluator.get_mean_measurements()[0]

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, mean_measurements.get_accuracy())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, mean_measurements.get_kappa())

    print(pipe.get_info())
    expected_info = (
        "Pipeline:\n"
        "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n"
        "                                      [10, 11, 12, 13, 14],\n"
        "                                      [15, 16, 17, 18, 19],\n"
        "                                      [20, 21, 22, 23, 24]])\n"
        "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n"
        "         nominal_attributes=None)]"
    )
    assert pipe.get_info() == expected_info
def test_pipeline(test_path):
    """Evaluate a OneHotToCategorical + KNNADWINClassifier pipeline through
    the event-observer evaluation classes.

    Every record of the stored one-hot test data is wrapped in a dictionary
    and handed to an ArrayDataSource; the accuracy and kappa found in the
    reporter's buffer are compared against fixed expected values.

    Parameters
    ----------
    test_path: str
        Directory that contains the ``data-one-hot.npz`` test file.

    """
    n_categories = 5
    # Width of one record: n_categories groups of n_categories columns (25)
    n_features = n_categories * n_categories

    test_file = os.path.join(test_path, 'data-one-hot.npz')
    # An .npz archive loads its members lazily, on every key access. Read
    # both arrays exactly once (instead of once per loop iteration) and
    # close the archive as soon as they are in memory.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']
    # Keep a length mismatch loud; zip() below would silently truncate.
    assert len(X) == len(y)

    # One dictionary per record: 'X' as a (1, n_features) row and 'y' as a
    # (1, 1) array.
    data_as_dict = [
        {
            'X': row.reshape(1, n_features),
            'y': np.array(label).reshape(1, 1)
        }
        for row, label in zip(X, y)
    ]

    # Setup transformer: index groups [0..4], [5..9], ..., [20..24]
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2,
                                    max_window_size=50,
                                    leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer),
                     ('KNNADWINClassifier', classifier)])

    # Evaluation wiring: trigger -> event observer -> result observer ->
    # buffered reporter.
    train_eval_trigger = PrequentialTrigger(10)
    reporter = BufferedMetricsReporter(retrieve_metrics)
    results_observer = MetricsResultObserver(ClassificationMeasurements(),
                                             reporter)
    # NOTE(review): [0, 1] is presumably the list of expected class labels;
    # confirm against EvaluationEventObserver.
    evaluation_event_observer = EvaluationEventObserver(
        pipe, train_eval_trigger, [results_observer], [0, 1])

    data_source = ArrayDataSource(record_to_dictionary,
                                  [evaluation_event_observer], data_as_dict)

    data_source.listen_for_events()
    # NOTE(review): presumably the events are processed in the background
    # and this fixed pause gives them time to finish; confirm, since a
    # fixed sleep is timing dependent.
    time.sleep(3)

    expected_accuracy = 0.5555555555555556
    expected_kappa = 0.11111111111111116

    assert np.isclose(expected_accuracy, reporter.get_buffer()['accuracy'])
    assert np.isclose(expected_kappa, reporter.get_buffer()['kappa'])
def demo():
    """ _test_knn_adwin

    Runs the KNNAdwin classifier on the ``sea_big.csv`` file stream, which
    according to the original description holds instances coming from a SEA
    generator.

    KNNAdwin is pretrained on a first batch and then, for max_samples
    (10000) instances, asked for a prediction before being updated with
    each instance. A KNeighborsClassifier fitted only on the pretraining
    batch is queried on the same instances. The elapsed time and the
    fraction of correct predictions of both models are printed.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    stream.prepare_for_use()

    # Two transformers with the index groups 10..13 and 14..53; they are
    # built here but not applied anywhere below.
    t = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])
    t2 = OneHotToCategorical([list(range(10, 14)), list(range(14, 54))])

    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')

    # Pretraining: when it happens, the class list is handed to KNNAdwin
    # here, so the loop below no longer has to pass it.
    train = 200
    classes_pending = not train > 0
    if train > 0:
        X, y = stream.next_sample(train)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)

    max_samples = 10000
    # Progress is logged every 1/20th of the run (true division, so the
    # logged percentage is a float)
    report_every = max_samples / 20
    my_corrects = 0
    compare_corrects = 0

    for seen in range(max_samples):
        if seen % report_every == 0:
            logging.info('%s%%', str(seen // report_every * 5))
        X, y = stream.next_sample()

        # Predict first, then learn from the same instance
        my_pred = knn.predict(X)
        if classes_pending:
            knn.partial_fit(X, y, classes=stream.target_values)
            classes_pending = False
        else:
            knn.partial_fit(X, y)

        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1

    n_samples = max_samples
    end = timer()

    print('Evaluation time: {}'.format(end - start))
    print('{} samples analyzed.'.format(n_samples))
    print('My performance: {}'.format(my_corrects / n_samples))
    print('Compare performance: {}'.format(compare_corrects / n_samples))
def demo(instances=2000):
    """ _test_comparison_prequential

    Runs a prequential evaluation in which more than one learner is handed
    to the evaluator, which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup: covtype dataset fetched from the streaming-datasets
    # repository
    covtype_url = (
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/covtype.csv"
    )
    covtype_stream = FileStream(covtype_url)

    # First learner: SGDClassifier, used without any transformer
    sgd = SGDClassifier()

    # Second learner: KNNADWINClassifier placed behind a OneHotToCategorical
    # transform configured with two index groups, 10..13 and 14..53.
    index_groups = [list(range(10, 14)), list(range(14, 54))]
    one_hot_transform = OneHotToCategorical(index_groups)
    knn_adwin = KNNADWINClassifier(n_neighbors=8,
                                   max_window_size=1000,
                                   leaf_size=30)
    knn_pipeline = Pipeline([
        ('one_hot_to_categorical', one_hot_transform),
        ('KNNClassifier', knn_adwin),
    ])

    # Both learners are evaluated side by side
    learners = [sgd, knn_pipeline]

    # Setup the evaluator
    evaluator_settings = dict(
        pretrain_size=2000,
        output_file='test_comparison_prequential.csv',
        max_samples=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        show_plot=True,
    )
    evaluator = EvaluatePrequential(**evaluator_settings)

    # Evaluate
    evaluator.evaluate(stream=covtype_stream, model=learners)
# Example no. 7
# 0
def demo():
    """ _test_kdtree_compare

    This demo compares creation and query speed for different kd tree
    implementations. They are fed with instances from the covtype dataset.

    Three kd tree implementations are compared: SciPy's KDTree
    (``spatial.KDTree``), the ``KDTree`` class in scope (described by the
    original text as scikit-multiflow's, labelled "Optimal" in the output)
    and scikit-learn's KDTree (``ng.KDTree``). For each of them the demo
    times the construction of the tree on 1000 instances, and then the time
    to query the 8 nearest neighbours of 100 instances. The results are
    displayed in the terminal.

    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    n_build = 1000    # instances used to build every tree
    n_queries = 100   # instances queried against every tree
    n_neighbors = 8   # neighbours requested per query

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)

    # Transformer configured with the index groups 10..13 and 14..53
    # (named one_hot_filter so the builtin ``filter`` is not shadowed).
    one_hot_filter = OneHotToCategorical([list(range(10, 14)),
                                          list(range(14, 54))])

    X, _ = stream.next_sample(n_build)
    X = one_hot_filter.transform(X)

    X_find, _ = stream.next_sample(n_queries)
    X_find = one_hot_filter.transform(X_find)
    print(X_find[4])

    # SciPy kdtree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end-start))

    # Use the same number of queries as the other two trees; otherwise the
    # reported query times are not comparable.
    start = timer()
    for i in range(n_queries):
        scipy_tree.query(X_find[i], n_neighbors)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    # Release each tree before the next one is built
    del scipy_tree

    # "Optimal" kdtree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        opt.query(X_find[i], n_neighbors)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # Sklearn kdtree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        # This tree is queried with a 2-D (1, n_features) array
        sk.query(np.asarray(X_find[i]).reshape(1, -1), n_neighbors,
                 return_distance=True)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk