Example #1
import time

from skmultiflow.data import FileStream


def train_tree(csv_path, tree):
    """Incrementally train the given tree on a CSV stream (test-then-train on each instance)."""

    print("Training the tree")

    stream = FileStream(csv_path)
    # Note: older scikit-multiflow versions also require stream.prepare_for_use() here.

    n_samples = 0
    correct_cnt = 0

    t0 = time.time()

    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Predict on the incoming instance before learning from it.
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1

    t1 = time.time()
    total = t1 - t0

    accuracy = 100.0 * correct_cnt / n_samples

    print("Training data instances: ", n_samples)
    print("Tree trained on ", n_samples, " instances & has ", accuracy,
          "% accuracy.")
    print("Training tree completed in ", total, " (s)")
Example #2
import os

import numpy as np

from skmultiflow.data import FileStream


def test_file_stream(test_path, package_path):
    """Check FileStream metadata and sampling against the bundled sea_stream.csv file."""
    test_file = os.path.join(package_path,
                             'src/skmultiflow/data/datasets/sea_stream.csv')
    stream = FileStream(test_file)
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'sea_stream.csv - 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "FileStream(filename='sea_stream.csv', target_idx=-1, n_targets=1, cat_features=None)"
    assert stream.get_info() == expected_info
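The get_info() string checked above simply echoes the constructor arguments; a minimal sketch of overriding them, e.g. for a CSV whose class label sits in the first column (the file name is illustrative):

from skmultiflow.data import FileStream

# Illustrative file; target_idx=0 reads the label from the first column, and
# cat_features would take a list of column indices to treat as categorical.
stream = FileStream("my_data.csv", target_idx=0, n_targets=1, cat_features=None)
X, y = stream.next_sample(10)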
Example #3
# Imports assumed for this demo; exact module paths may differ across
# scikit-multiflow releases (the h= keyword indicates an older API).
import logging
from timeit import default_timer as timer

from sklearn.linear_model import Perceptron

from skmultiflow.core.pipeline import Pipeline
from skmultiflow.data import FileStream
from skmultiflow.meta import MultiOutputLearner
from skmultiflow.metrics import hamming_score


def demo():
    """ _test_mol

    This demo tests the MOL learner on a file stream that reads from
    the music.csv file.

    The test computes the performance of the MOL learner as well as 
    the time to create the structure and classify all the samples in 
    the file.

    """
    # Setup logging
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Setup the file stream
    stream = FileStream("../datasets/music.csv", 0, 6)
    stream.prepare_for_use()

    # Set up the classifier; by default it uses Logistic Regression
    # classifier = MultiOutputLearner()
    # classifier = MultiOutputLearner(h=SGDClassifier(n_iter=100))
    classifier = MultiOutputLearner(h=Perceptron())

    # Setup the pipeline
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pre-training on %s samples', str(pretrain_size))
    X, y = stream.next_sample(pretrain_size)
    # classifier.fit(X, y)
    pipe.partial_fit(X, y, classes=stream.get_targets())
    count = 0
    true_labels = []
    predicts = []
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # p = classifier.predict(X)
        p = pipe.predict(X)
        predicts.extend(p)
        true_labels.extend(y)
        count += 1
    perf = hamming_score(true_labels, predicts)
    logging.info('Evaluation time: %s s', str(timer() - init_time))
    logging.info('Total samples analyzed: %s', str(count))
    logging.info('The classifier\'s static Hamming score    : %0.3f' % perf)
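For reference, one common definition of the Hamming score reported above is label-wise accuracy (1 minus the Hamming loss); a minimal NumPy sketch of that idea, which may differ in detail from the library's own implementation:

import numpy as np

# Fraction of individual labels predicted correctly across all samples.
y_true = np.array([[1, 0, 1], [0, 1, 1]])
y_pred = np.array([[1, 0, 0], [0, 1, 1]])
score = np.mean(y_true == y_pred)  # 5 of 6 labels correct -> ~0.833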
Example #4
import time

from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score

from skmultiflow.data import FileStream


def test_tree(csv_path, tree):
    """Evaluate the tree prequentially on a CSV stream (test, then keep training)."""

    print("Testing the tree")

    stream = FileStream(csv_path)

    n_samples = 0
    correct_cnt = 0

    t2 = time.time()

    y_true_all = list()
    y_pred_all = list()
    while stream.has_more_samples():
        X, y = stream.next_sample()
        y_pred = tree.predict(X)
        if y[0] == y_pred[0]:
            correct_cnt += 1
        tree = tree.partial_fit(X, y)
        n_samples += 1

        y_true_all.append(y[0])
        y_pred_all.append(y_pred[0])

    t3 = time.time()
    total = t3 - t2

    accuracy = 100.0 * correct_cnt / n_samples
    fscore = f1_score(y_true_all, y_pred_all, average='binary')
    gm = geometric_mean_score(y_true_all, y_pred_all, average='binary')

    print("Test data instances: ", n_samples)
    print("Tree tested on ", n_samples, " instances & has ", accuracy, "% accuracy.")
    print("Tree has F-score: %.3f" % fscore)
    print("Tree has GM: %.3f" % gm)
    print("Testing tree completed in ", total, " (s)")

    return round(fscore, 3), round(gm, 3)
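A minimal sketch chaining the two helpers from Examples #1 and #4, assuming a Hoeffding tree and illustrative train/test CSV paths (none of these names appear in the original snippets):

from skmultiflow.trees import HoeffdingTreeClassifier

tree = HoeffdingTreeClassifier()
train_tree("train_split.csv", tree)              # incremental training pass
fscore, gm = test_tree("test_split.csv", tree)   # prequential evaluation pass
print(fscore, gm)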
Example #5
# Retrieving 5 samples
data_stream.next_sample(5)
# Output-
#(array([[ 36.   ,   0.   ,   7.   ,   3.   ,   1.   , 118.   ,  13.   ,
#          18.   ,  50.   , 239.554,  97.   ,   1.   ,   1.   ,   1.   ,
#           1.   ,   0.   ,   0.   ,  98.   , 178.   ,  31.   ],
#        [  3.   ,  23.   ,   7.   ,   4.   ,   1.   , 179.   ,  51.   ,
#          18.   ,  38.   , 239.554,  97.   ,   0.   ,   1.   ,   0.   ,
#           1.   ,   0.   ,   0.   ,  89.   , 170.   ,  31.   ],
#        [  7.   ,   7.   ,   7.   ,   5.   ,   1.   , 279.   ,   5.   ,
#          14.   ,  39.   , 239.554,  97.   ,   0.   ,   1.   ,   2.   ,
#           1.   ,   1.   ,   0.   ,  68.   , 168.   ,  24.   ],
#        [ 11.   ,  23.   ,   7.   ,   5.   ,   1.   , 289.   ,  36.   ,
#          13.   ,  33.   , 239.554,  97.   ,   0.   ,   1.   ,   2.   ,
#           1.   ,   0.   ,   1.   ,  90.   , 172.   ,  30.   ],
#        [  3.   ,  23.   ,   7.   ,   6.   ,   1.   , 179.   ,  51.   ,
#          18.   ,  38.   , 239.554,  97.   ,   0.   ,   1.   ,   0.   ,
#           1.   ,   0.   ,   0.   ,  89.   , 170.   ,  31.   ]]),
# array([0, 2, 4, 2, 2]))

data_stream.has_more_samples()
# Output-
# True

data_stream.n_remaining_samples()
# Output-
# 734
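The data_stream object used above is not constructed in this snippet; a minimal sketch of how such a stream could be set up, assuming a CSV file with the class label in the last column (the file name is illustrative):

from skmultiflow.data import FileStream

# Illustrative setup; the actual CSV behind data_stream is not shown above.
data_stream = FileStream("activity_data.csv")  # label read from the last column by default
X, y = data_stream.next_sample(5)              # next five instances
data_stream.n_remaining_samples()              # instances left in the file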

#####################################################################################
Example #6
import numpy as np

# `stream`, `goowe` and CHUNK_SIZE are assumed to be defined earlier in the script.
X_init, y_init = stream.next_sample(CHUNK_SIZE)
print(X_init)
print(y_init)
goowe.partial_fit(X_init, y_init)

accuracy = 0.0
total = 0.0
true_predictions = 0.0

# Warm-up phase: test-then-train on one more chunk, instance by instance.
for i in range(CHUNK_SIZE):
    total += 1
    cur = stream.next_sample()
    X, y = cur[0], cur[1]
    preds = goowe.predict(X)
    true_predictions += np.sum(preds == y)
    accuracy = true_predictions / total
    print('\tData instance: {} - Accuracy: {}'.format(total, accuracy))
    goowe.partial_fit(X, y)

# Now, for the remaining instances, do ITTT (Interleaved Test Then Train).
while stream.has_more_samples():
    total += 1
    cur = stream.next_sample()
    X, y = cur[0], cur[1]
    preds = goowe.predict(X)            # Test
    true_predictions += np.sum(preds == y)
    accuracy = true_predictions / total
    print('\tData instance: {} - Accuracy: {}'.format(int(total), round(accuracy * 100.0, 3)))
    goowe.partial_fit(X, y)             # Then train
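scikit-multiflow also ships a prequential evaluator that automates this interleaved test-then-train loop; a minimal sketch, assuming an illustrative stream file and a Hoeffding tree in place of the GOOWE ensemble (which is not part of the library):

from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.trees import HoeffdingTreeClassifier

stream = FileStream("my_stream.csv")   # illustrative file
model = HoeffdingTreeClassifier()

evaluator = EvaluatePrequential(pretrain_size=500,
                                max_samples=20000,
                                metrics=['accuracy'])
evaluator.evaluate(stream=stream, model=model)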