Esempio n. 1
0
def test_random_rbf_generator(test_path, package_path):
    """Check FileStream metadata and sample retrieval on the SEA CSV file.

    NOTE(review): despite its name, this test exercises ``FileStream``
    reading ``sea_stream.csv``, not a random RBF generator. The name is kept
    so existing test selections keep working.

    Parameters
    ----------
    test_path
        Directory containing the reference file ``sea_stream.npz``
        (presumably supplied by a pytest fixture -- confirm in conftest).
    package_path
        Directory under which ``src/skmultiflow/datasets/sea_stream.csv``
        is located (presumably supplied by a pytest fixture as well).
    """
    test_file = os.path.join(package_path,
                             'src/skmultiflow/datasets/sea_stream.csv')
    file_option = FileOption('FILE', 'sea', test_file, 'csv', False)
    stream = FileStream(file_option)
    stream.prepare_for_use()

    # Stream-level metadata
    assert stream.estimated_remaining_instances() == 40000

    expected_header = ['attrib1', 'attrib2', 'attrib3']
    assert stream.get_attributes_header() == expected_header

    expected_classes = [0, 1]
    assert stream.get_classes() == expected_classes

    assert stream.get_classes_header() == ['class']

    assert stream.get_num_attributes() == 3

    assert stream.get_num_nominal_attributes() == 0

    assert stream.get_num_numerical_attributes() == 3

    assert stream.get_num_targets() == 1

    assert stream.get_num_values_per_nominal_attribute() == 0

    assert stream.get_plot_name() == 'sea_stream.csv - 2 class labels'

    assert stream.has_more_instances() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances.
    # The context manager closes the underlying .npz archive once the arrays
    # have been read into memory.
    test_file = os.path.join(test_path, 'sea_stream.npz')
    with np.load(test_file) as data:
        X_expected = data['X']
        y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_instance()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # The last instance must be the one that was just returned.
    X, y = stream.get_last_instance()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart, a batch of 10 must reproduce the reference data.
    stream.restart()
    X, y = stream.next_instance(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)
Esempio n. 2
0
def demo():
    """ _test_mol

    Run a MultiOutputLearner (configured with ``h=Perceptron()``) against a
    file stream that reads from the music.csv file.

    The learner is first fitted on an initial batch of samples. The rest of
    the stream is then predicted one sample at a time, and the evaluation
    time, the number of evaluated samples and the Hamming score are logged.

    """
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # File stream over the music dataset
    stream = FileStream(
        FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False),
        0, 6)
    stream.prepare_for_use()

    # Single-step pipeline wrapping the multi-output learner
    pipe = Pipeline([('classifier', MultiOutputLearner(h=Perceptron()))])

    # Initial fit on the first batch of the stream
    n_pretrain = 150
    logging.info('Pre training on %s samples', str(n_pretrain))
    X_init, y_init = stream.next_instance(n_pretrain)
    pipe.partial_fit(X_init, y_init, classes=stream.get_classes())

    # Predict every remaining sample, collecting predictions and true labels
    n_evaluated = 0
    expected = []
    predicted = []
    t_start = timer()
    logging.info('Evaluating...')
    while stream.has_more_instances():
        X, y = stream.next_instance()
        predicted.extend(pipe.predict(X))
        expected.extend(y)
        n_evaluated += 1

    score = hamming_score(expected, predicted)
    elapsed = str(timer() - t_start)
    logging.info('Evaluation time: %s s', elapsed)
    logging.info('Total samples analyzed: %s', str(n_evaluated))
    logging.info('The classifier\'s static Hamming score    : %0.3f' % score)
def demo():
    """ _test_knn_adwin

    Compare the KNNAdwin classifier with scikit-learn's KNeighborsClassifier
    on a file stream read from sea_big.csv.

    Both models are first fitted on an initial batch of ``train`` samples.
    Then up to ``max_samples`` (10000) instances are drawn one at a time.
    Each instance is predicted by both models and afterwards used to update
    KNNAdwin through ``partial_fit``; the KNeighborsClassifier is not
    refitted. Evaluation stops early if the stream runs out of instances.

    The elapsed time (measured from the start of the function), the number
    of evaluated samples and the accuracy of both models are printed.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    opt = FileOption('FILE', 'OPT_NAME', '../datasets/sea_big.csv', 'csv',
                     False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    knn = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')

    # `first` stays True until KNNAdwin has been given the class list once.
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_instance(train)
        knn.partial_fit(X, y, classes=stream.get_classes())
        compare.fit(X, y)
        first = False

    n_samples = 0
    max_samples = 10000
    # Integer step so the progress log reads "5%" rather than "5.0%".
    progress_step = max(1, max_samples // 20)
    my_corrects = 0
    compare_corrects = 0

    # Also stop when the stream is exhausted, so a short file does not break
    # the loop with an empty batch.
    while n_samples < max_samples and stream.has_more_instances():
        if n_samples % progress_step == 0:
            logging.info('%s%%', str(n_samples // progress_step * 5))
        X, y = stream.next_instance()

        # Predict before training on the sample.
        my_pred = knn.predict(X)
        if first:
            knn.partial_fit(X, y, classes=stream.get_classes())
            first = False
        else:
            knn.partial_fit(X, y)
        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    if n_samples == 0:
        # Nothing was evaluated, so the accuracies are undefined.
        return
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))