Beispiel #1
0
def demo():
    """ _test_streams

    Print samples drawn from a file stream and from a SEA generator.

    One sample is requested from each source first (``next_sample()`` with no
    argument), then a batch of ten from each. A RandomRBFGeneratorDrift is
    also constructed, but no samples are drawn from it.

    :return: None
    """
    covtype_url = ("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                   "master/covtype.csv")
    stream = FileStream(covtype_url)

    # Constructed only; never sampled in this demo.
    rbf_drift = RandomRBFGeneratorDrift(
        change_speed=41.00,
        n_centroids=50,
        model_seed=32523423,
        instance_seed=5435,
        n_classes=2,
        n_features=10,
        num_drift_centroids=50,
    )

    sea = SEAGenerator()

    # Each round is (banner, positional arguments for next_sample): the first
    # round calls next_sample() bare, the second asks for 10 samples.
    rounds = (
        ('1 instance:\n', ()),
        ('\n\n10 instances:\n', (10,)),
    )
    for banner, sample_args in rounds:
        print(banner)
        for source in (stream, sea):
            X, y = source.next_sample(*sample_args)
            print(X)
            print(y)
Beispiel #2
0
def demo():
    """ _test_knn

    Run a KNNClassifier over the sea_big.csv file stream.

    The classifier is fitted once on the first 200 samples and is then asked
    to predict 5000 further samples, one at a time, without any additional
    training. The elapsed time (classifier construction, initial fit and all
    predictions), the number of samples and the fraction of correct
    predictions are printed.

    """
    sea_url = ("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
               "master/sea_big.csv")
    stream = FileStream(sea_url)

    pretrain_size = 200
    X, y = stream.next_sample(pretrain_size)

    # The clock starts after the pre-training batch has been read.
    started = timer()
    knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn.partial_fit(X, y)

    max_samples = 5000
    hits = 0
    for _ in range(max_samples):
        X, y = stream.next_sample()
        prediction = knn.predict(X)
        if y[0] == prediction[0]:
            hits += 1
    n_samples = max_samples

    finished = timer()

    print('Evaluation time: ', finished - started, sep='')
    print(n_samples, ' samples analyzed.', sep='')
    print('My performance: ', hits / n_samples, sep='')
Beispiel #3
0
def demo():
    """ _test_mol

    Evaluate a MultiOutputLearner (with a Perceptron base estimator) on the
    music.csv file stream.

    The learner is wrapped in a one-step Pipeline, partially fitted on the
    first 150 samples, and then used to predict every remaining sample of the
    stream one at a time. The evaluation time, the number of evaluated samples
    and the Hamming score over all predictions are logged.

    """
    # Log bare messages at INFO level.
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # File stream over the music dataset.
    stream = FileStream("../data/datasets/music.csv", 0, 6)
    stream.prepare_for_use()

    # Multi-output learner wrapped in a single-step pipeline.
    classifier = MultiOutputLearner(base_estimator=Perceptron())
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pre training on %s samples', pretrain_size)
    logging.info('Total %s samples', stream.n_samples)
    X, y = stream.next_sample(pretrain_size)

    # Flatten the per-target label lists into one list of distinct labels.
    distinct_labels = set()
    for target_labels in stream.target_values:
        distinct_labels.update(target_labels)
    classes_flat = list(distinct_labels)
    pipe.partial_fit(X, y, classes=classes_flat)

    true_labels = []
    predicts = []
    count = 0
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        predicted = pipe.predict(X)
        predicts.extend(predicted)
        true_labels.extend(y)
        count += 1

    perf = hamming_score(true_labels, predicts)
    elapsed = timer() - init_time
    logging.info('Evaluation time: %s s', elapsed)
    logging.info('Total samples analyzed: %s', count)
    logging.info("The classifier's static Hamming score    : %0.3f" % perf)
Beispiel #4
0
def demo():
    """ _test_streams

    Print samples drawn from a file stream and from a SEA generator.

    Every stream created here is prepared with ``prepare_for_use()`` before
    any sample is requested. One sample is then drawn from the file stream
    and from the SEA generator, followed by a batch of ten from each. The
    RandomRBFGeneratorDrift is constructed and prepared but not sampled.

    :return: None
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    stream.prepare_for_use()

    # Constructed and prepared only; no samples are drawn from it below.
    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00,
                                        n_centroids=50,
                                        model_seed=32523423,
                                        instance_seed=5435,
                                        n_classes=2,
                                        n_features=10,
                                        num_drift_centroids=50)
    rbf_drift.prepare_for_use()

    sea = SEAGenerator()
    # The SEA generator is sampled below, so it is prepared like the other
    # streams instead of being used straight after construction.
    sea.prepare_for_use()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
def demo():
    """ _test_knn_adwin

    Compare a KNNAdwin classifier against a KNeighborsClassifier on the
    sea_big.csv file stream.

    Both models are first trained on ``train`` (200) samples. Then, for
    ``max_samples`` (10000) samples taken one at a time, both models predict
    the label; KNNAdwin is additionally updated with each sample via
    ``partial_fit`` after predicting, while the KNeighborsClassifier keeps its
    initial fit. Progress is logged every 5% of ``max_samples``, and the total
    elapsed time (measured from the start of the function) and both accuracies
    are printed at the end.

    NOTE: ``compare`` is only fitted inside the ``train > 0`` branch, so
    setting ``train`` to 0 would leave it unfitted when it is asked to predict.

    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    stream.prepare_for_use()

    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')

    # `first` stays True until knn has received the partial_fit call that
    # carries the list of classes.
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False

    n_samples = 0
    max_samples = 10000
    # Integer step so the progress check does not rely on float modulo and
    # the logged percentage is an int ("5%" rather than "5.0%").
    progress_step = max(1, max_samples // 20)
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % progress_step == 0:
            logging.info('%s%%', n_samples * 100 // max_samples)
        X, y = stream.next_sample()

        # Predict before training on the sample.
        my_pred = knn.predict(X)
        if first:
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            knn.partial_fit(X, y)
        compare_pred = compare.predict(X)

        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
Beispiel #6
0
def demo():
    """ _test_kdtree_compare

    Compare construction and query speed of three kd-tree implementations
    fed with instances from the covtype dataset.

    The trees compared are ``spatial.KDTree`` (reported as "Scipy"), the
    ``KDTree`` name in scope (reported as "Optimal") and ``ng.KDTree``
    (reported as "Sklearn"). Each tree is built on the same 1000 transformed
    instances and then queried for the 8 nearest neighbours of the same 100
    instances, so the printed timings are directly comparable.

    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    # Prepare the stream before the first next_sample() call.
    stream.prepare_for_use()

    # Named so that it does not shadow the built-in `filter`.
    one_hot_filter = OneHotToCategorical(
        [[10, 11, 12, 13],
         [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
          34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    # Instances the trees are built on.
    X, y = stream.next_sample(1000)
    X = one_hot_filter.transform(X)

    # Instances used as query points.
    n_queries = 100
    n_neighbors = 8
    X_find, y = stream.next_sample(n_queries)
    X_find = one_hot_filter.transform(X_find)
    print(X_find[4])

    # SciPy kd-tree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end-start))

    start = timer()
    # Query the same number of points as the other two trees; this loop
    # previously ran only 10 queries, which made the timings incomparable.
    for i in range(n_queries):
        ind = scipy_tree.query(X_find[i], n_neighbors)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    del scipy_tree

    # "Optimal" kd-tree (the KDTree name in scope)
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        ind, dist = opt.query(X_find[i], n_neighbors)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # Sklearn kd-tree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(n_queries):
        # This tree is given each query point as a 2-D (1, n_features) array.
        ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), n_neighbors, return_distance=True)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk
    # NOTE(review): `r`, `classifier`, `predicted_labels` and `true_labels` are
    # not defined anywhere in the visible enclosing code. Presumably this is the
    # tail of the `run(X, y, hyperParams)` function called from the __main__
    # block, with its header missing from this view -- confirm against the
    # original source.
    #
    # Test-then-train loop: predict sample i, record the prediction and the
    # true label, then update the classifier with that same sample.
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        # Log progress every r // 20 samples (raises ZeroDivisionError if r < 20).
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20))*5) + "%")
    # Report the error rate as a percentage of all predictions made above.
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100-100*accuracy))


if __name__ == '__main__':
    # Script entry point: load a whole dataset into memory and hand it to run().
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Hyper-parameters passed to run(). An alternative configuration that was
    # tried: {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance',
    #         'STMSizeAdaption': None, 'useLTM': False}
    hyperParams = {
        'maxSize': 1000,
        'nNeighbours': 5,
        'knnWeights': 'distance',
        'STMSizeAdaption': 'maxACCApprox',
        'useLTM': False,
    }

    logging.info('loading dataset')
    # Only one stream is built: a stream over "../data/datasets/moving_squares.csv"
    # used to be created here and was immediately overwritten. Another dataset
    # mentioned as an alternative: "../data/datasets/weather.csv".
    # NOTE(review): this absolute path is machine-specific; adjust it to the
    # local checkout before running.
    stream = FileStream("/Users/jing/local/scikit-multiflow/src/skmultiflow/data/datasets/covtype.csv")

    stream.prepare_for_use()

    # Read every sample of the stream in one call.
    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples', X.shape[0])
    logging.info('%d dimensions', X.shape[1])
    run(X[:], y[:], hyperParams)

# Streaming learners and the prequential evaluator.
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()

evaluator = EvaluatePrequential(
    max_samples=10000,
    max_time=1000,
    show_plot=True,
    pretrain_size=3000,
    metrics=['accuracy'],
)

# Single batch classification
# stream = FileStream('file.csv')
stream = FileStream("RBF Dataset.csv")
stream10 = FileStream("RBF Dataset 10.csv")
stream70 = FileStream("RBF Dataset 70.csv")

# Request 10000 samples from each stream, in the order
# stream, stream10, stream70.
(X1, y1), (X2, y2), (X3, y3) = (
    source.next_sample(10000) for source in (stream, stream10, stream70)
)

# 60/40 train/test split of each batch, all with the same random seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.4, random_state=109)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.4, random_state=109)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.4, random_state=109)

# Classifiers; `nb` and `ht` are rebound to newly constructed instances here.
MLP = MLPClassifier(hidden_layer_sizes=(200, 4), random_state=1, max_iter=300)
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()

# ht2_trained = ht.fit(X3_train,y3_train)