Example #1
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from skmultiflow.data import DataStream
from skmultiflow.meta import AdaptiveRandomForestClassifier


def ARF_run(dataset_name, batch, num_copy):
    # load_arff and path are project-specific helpers/constants defined elsewhere
    data = load_arff(path, dataset_name, num_copy)
    # Wrap the loaded DataFrame as a scikit-multiflow data stream
    stream = DataStream(data)

    # Set up variables to control the loop and track performance
    n_samples = 0
    max_samples = data.shape[0]

    # Test-then-train: predict on each batch, then train on it
    pred = np.empty(0)
    np.random.seed(0)

    model = AdaptiveRandomForestClassifier()
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred, y_pred))
        model.partial_fit(X, y, classes=stream.target_values)
        n_samples += batch

    # Evaluate, skipping the first batch (it was predicted before any training)
    Y = data.values[:, -1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    print("acc:", acc)
    print("f1:", f1)

    # Collect predictions and ground truth side by side for saving
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
    return result
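
# A self-contained sketch of the same batched test-then-train pattern on a
# synthetic stream, since load_arff and path are project-specific. The
# AGRAWALGenerator and all parameter values below are illustrative
# assumptions, not part of the original snippet.
from skmultiflow.data import AGRAWALGenerator

demo_stream = AGRAWALGenerator(random_state=1)
demo_model = AdaptiveRandomForestClassifier()
demo_pred = np.empty(0)
seen = 0
while seen < 1000 and demo_stream.has_more_samples():
    X, y = demo_stream.next_sample(100)  # batch of 100 samples
    demo_pred = np.hstack((demo_pred, demo_model.predict(X)))  # test first
    demo_model.partial_fit(X, y, classes=demo_stream.target_values)  # then train
    seen += 100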
Example #2
def test_adaptive_random_forests_nba():
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)

    learner = AdaptiveRandomForestClassifier(n_estimators=3,
                                             random_state=112)

    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y, classes=[0, 1])  # labels given

    cnt = 0
    max_samples = 5000
    y_proba = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_proba.append(learner.predict_proba(X)[0])
            true_labels.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    assert np.alltrue([np.isclose(probabilities.sum(), 1) for probabilities in y_proba]), \
        "Probabilities should sum to 1."

    y_proba = np.asarray(y_proba).squeeze()
    assert y_proba.shape == (49, 2)

    y_pred = y_proba.argmax(axis=1)
    y_pred_expected = [1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
                       1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
                       1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
                       1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
                       0, 0, 0, 1, 1, 1, 0, 0, 0]

    # Performance below does not need to be guaranteed. This check is set up
    # so that any change to the predictions is caught by the unit test.
    # This helps prevent accidental changes.

    assert type(learner.predict(X)) == np.ndarray
    assert np.alltrue(y_pred == y_pred_expected)

    expected_info = "AdaptiveRandomForestClassifier(binary_split=False, " \
                    "disable_weighted_vote=False, drift_detection_method=ADWIN(delta=0.001), " \
                    "grace_period=50, lambda_value=6, leaf_prediction='nba', " \
                    "max_byte_size=33554432, max_features=5, memory_estimate_period=2000000, " \
                    "n_estimators=3, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=None, performance_metric='acc', random_state=112, " \
                    "remove_poor_atts=False, split_confidence=0.01, " \
                    "split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05, warning_detection_method=ADWIN(delta=0.01))"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
Example #3
def test_adaptive_random_forests_mc():
    stream = RandomTreeGenerator(
        tree_random_state=112, sample_random_state=112, n_classes=2
    )
    stream.prepare_for_use()

    learner = AdaptiveRandomForestClassifier(n_estimators=3, leaf_prediction='mc',
                                             random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(int(learner.predict(X)[0]))
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1
    last_version_predictions = [0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
                                1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
                                1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
                                1]

    # Performance below does not need to be guaranteed. This check is set up so that
    # any change to the predictions is caught by the unit test. This helps prevent accidental changes.

    assert type(learner.predict(X)) == np.ndarray
    assert np.alltrue(predictions == last_version_predictions)

    expected_info = "AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,\n" \
                    "                               drift_detection_method=ADWIN(delta=0.001),\n" \
                    "                               grace_period=50, lambda_value=6,\n" \
                    "                               leaf_prediction='mc', max_byte_size=33554432,\n" \
                    "                               max_features=5, memory_estimate_period=2000000,\n" \
                    "                               n_estimators=3, nb_threshold=0,\n" \
                    "                               no_preprune=False, nominal_attributes=None,\n" \
                    "                               performance_metric='acc', random_state=112,\n" \
                    "                               remove_poor_atts=False, split_confidence=0.01,\n" \
                    "                               split_criterion='info_gain',\n" \
                    "                               stop_mem_management=False, tie_threshold=0.05,\n" \
                    "                               warning_detection_method=ADWIN(delta=0.01))"
    assert learner.get_info() == expected_info
Example #4
def test_adaptive_forest():
    test_data_directory = os.path.join(TEST_DIRECTORY, 'data')
    test_file = os.path.join(
        test_data_directory,
        'test_data/weather.csv'
    )
    raw_data = pd.read_csv(test_file)
    stream1 = DataStream(raw_data, name='Test')
    stream2 = DataStream(raw_data, name='Test')
    # Alternative learners, kept commented out for reference:
    # learner = ExtendedHoeffdingAdaptiveTree()
    # learner1 = AdaptiveHoeffdingTreeEnsemble(n_estimators=4)
    # stream1_learner = calculate_accuracy(learner, stream1, stream1.n_samples)
    # stream2_learner = calculate_accuracy(learner1, stream2, stream2.n_samples)
    learner3 = AdaptiveRandomForestClassifier(n_estimators=10)
    stream3_learner = calculate_accuracy(learner3, stream1, stream1.n_samples)
    # learner4 = StreamingRandomPatchesClassifier(n_estimators=3)
    # stream4_learner = calculate_accuracy(learner4, stream1, stream1.n_samples)
    # learner5 = DeepStreamLearner(classes=stream1.target_values)
    # stream5_learner = calculate_accuracy(learner5, stream1, stream1.n_samples)

    assert stream3_learner is not None
    # print(stream2_learner.base_estimator.accuracy)
    with open(
            os.path.join(test_data_directory, 'test_data/adaptive_test_result.txt'),
            'w'
    ) as f:
        f.write('stream3 average_accuracy:')
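
# calculate_accuracy is a project helper that is not shown in this snippet.
# A hypothetical minimal version, assuming a prequential (test-then-train)
# loop that attaches the final accuracy to the learner; the real helper may
# differ.
def calculate_accuracy(learner, stream, n_samples):
    correct, seen = 0, 0
    while seen < n_samples and stream.has_more_samples():
        X, y = stream.next_sample()
        # Skip the very first prediction, made before any training
        if seen > 0 and learner.predict(X)[0] == y[0]:
            correct += 1
        learner.partial_fit(X, y, classes=stream.target_values)
        seen += 1
    learner.accuracy = correct / max(seen - 1, 1)  # assumed attribute, for inspection
    return learner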
Example #5
def test_adaptive_random_forests_batch_predict_proba():
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForestClassifier(n_estimators=3,
                                             random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    cnt = 0
    max_samples = 500
    predictions = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample(5)
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            p = learner.predict_proba(X)
            assert p.shape == (5, 2)
            predictions.append(p)
            true_labels.append(y)
        learner.partial_fit(X, y)
        cnt += 1

    all_predictions = np.concatenate(predictions)
    # all_true_labels = np.asarray(true_labels).flatten()
    # correct_predictions = sum(np.equal(all_true_labels, all_predictions.argmax(axis=1)))

    assert np.alltrue([np.isclose(y_proba.sum(), 1) for y_proba in all_predictions]), "Probabilities should sum to 1."
    assert all_predictions.shape == (4 * 5, 2)

    last_version_predictions = [1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1]
    assert type(learner.predict_proba(X)) == np.ndarray
    assert np.alltrue(all_predictions.argmax(axis=1) == last_version_predictions)
Example #6
def test_adaptive_random_forests_labels_given():
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForestClassifier(n_estimators=3,
                                             random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict_proba(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1].argmax()):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    assert np.alltrue([np.isclose(y_proba.sum(), 1) for y_proba in predictions]), "Probabilities should sum to 1."

    class_probabilities = np.asarray(predictions).squeeze()
    assert class_probabilities.shape == (49, 2)

    predictions = class_probabilities.argmax(axis=1)
    last_version_predictions = [1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                                1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0]

    assert np.alltrue(predictions == last_version_predictions)
Example #7
def make_stream(path, classifier):
    # flow_detection_classifier is a project helper (not shown here);
    # see the hypothetical sketch below
    stream = FileStream(path)
    evaluator = flow_detection_classifier(classifier, stream)
    stream = evaluator.stream.y
    return stream
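
# flow_detection_classifier is not shown in this snippet. A hypothetical
# minimal sketch, assuming it wraps a prequential evaluation and returns the
# evaluator; the project's real helper may differ.
from skmultiflow.evaluation import EvaluatePrequential

def flow_detection_classifier(classifier, stream):
    evaluator = EvaluatePrequential(max_samples=10000, show_plot=False,
                                    metrics=['accuracy'])
    evaluator.evaluate(stream=stream, model=classifier)
    return evaluator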


# Streams based on classifiers:

# KNNClassifier
make_stream(PATH, KNNClassifier(n_neighbors=5, max_window_size=1000, leaf_size=30))
make_stream(PATH, KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40))

# HoeffdingTreeClassifier
make_stream(PATH, HoeffdingTreeClassifier(memory_estimate_period=1000000, grace_period=200, leaf_prediction='nba'))
make_stream(PATH, HoeffdingTreeClassifier(memory_estimate_period=2000000, grace_period=300, leaf_prediction='mc'))

# AdditiveExpertEnsembleClassifier
make_stream(PATH, AdditiveExpertEnsembleClassifier(n_estimators=5, beta=0.8, gamma=0.1, pruning='weakest'))
make_stream(PATH, AdditiveExpertEnsembleClassifier(n_estimators=8, beta=0.9, gamma=0.3, pruning='oldest'))

# VeryFastDecisionRulesClassifier
make_stream(PATH, VeryFastDecisionRulesClassifier(grace_period=200, tie_threshold=0.05, max_rules=20))
make_stream(PATH, VeryFastDecisionRulesClassifier(grace_period=300, tie_threshold=0.1, max_rules=30))

# AdaptiveRandomForestClassifier
make_stream(PATH, AdaptiveRandomForestClassifier(n_estimators=10, lambda_value=6, performance_metric='acc'))
make_stream(PATH, AdaptiveRandomForestClassifier(n_estimators=20, lambda_value=8, performance_metric='kappa'))

# AccuracyWeightedEnsembleClassifier
make_stream(PATH, AccuracyWeightedEnsembleClassifier(n_estimators=10, n_kept_estimators=30, window_size=200, n_splits=5))
make_stream(PATH, AccuracyWeightedEnsembleClassifier(n_estimators=15, n_kept_estimators=40, window_size=300, n_splits=8))
#############################################################################################

# Applying Adaptive Random Forest Classifier on a synthetic data stream
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.data.sea_generator import SEAGenerator

# Simulate the data stream
dstream = SEAGenerator(classification_function=2,
                       balance_classes=True,
                       noise_percentage=0.3,
                       random_state=333)

# Instantiate the Adaptive Random Forest classifier
ARF_class = AdaptiveRandomForestClassifier(n_estimators=100,
                                           max_features="sqrt",
                                           random_state=333)

# Prequential Evaluation
evaluate1 = EvaluatePrequential(show_plot=False,
                                pretrain_size=1000,
                                max_samples=10000,
                                metrics=['accuracy'])
# Run the evaluation
evaluate1.evaluate(stream=dstream, model=ARF_class)

###################################################

### ARF regressor
# Import the relevant libraries
from skmultiflow.meta import AdaptiveRandomForestRegressor
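
# A minimal sketch of the regressor counterpart, assuming a RegressionGenerator
# stream and the mean_square_error metric; these choices are illustrative
# assumptions, not from the original source.
from skmultiflow.data import RegressionGenerator

dstream_reg = RegressionGenerator(n_samples=4000, n_features=9, random_state=333)
ARF_reg = AdaptiveRandomForestRegressor(n_estimators=50, random_state=333)
evaluate2 = EvaluatePrequential(show_plot=False,
                                pretrain_size=400,
                                max_samples=4000,
                                metrics=['mean_square_error'])
evaluate2.evaluate(stream=dstream_reg, model=ARF_reg)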
Example #9
from skmultiflow.data import SEAGenerator
from skmultiflow.meta import AdaptiveRandomForestClassifier

stream = SEAGenerator(random_state=1)

# Setup Adaptive Random Forest Classifier
arf = AdaptiveRandomForestClassifier()

# Setup variables to control loop and track performance
n_samples = 0
correct_cnt = 0
max_samples = 1200

# Train the estimator with the samples provided by the data stream
while n_samples < max_samples and stream.has_more_samples():
    X, y = stream.next_sample()
    y_pred = arf.predict(X)
    if y[0] == y_pred[0]:
        correct_cnt += 1
    arf.partial_fit(X, y)
    n_samples += 1

# Display results
print('Adaptive Random Forest ensemble classifier example')
print('{} samples analyzed.'.format(n_samples))
print('Accuracy: {}'.format(correct_cnt / n_samples))
#------------------------------------------------Experiment 3--------------------------------------------------------------- 
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.meta import LeveragingBaggingClassifier
# Read in stream
stream = FileStream(r"C:\Users\luyj0\OneDrive\Desktop\COMPX523-Data Stream Mining\covtype_numeric.csv")
# Set up different classifiers
knn = MyKNNClassifier()
ht = HoeffdingTreeClassifier()
nb = NaiveBayes()
wv_knn = MyKNNClassifier(weighted_vote=True)
s_knn = MyKNNClassifier(standardize=True)
# Set up two ensemble algorithms
arf = AdaptiveRandomForestClassifier()
lb = LeveragingBaggingClassifier()
# Metrics to track during evaluation
metrics = ['accuracy', 'kappa', 'kappa_m', 'kappa_t', 'running_time', 'model_size']
# use a test-then-train evaluation approach
evaluator = EvaluatePrequential(max_samples=30000,
                                n_wait=100,
                                show_plot=False,
                                metrics=metrics)

model_list = [knn, ht, nb, wv_knn, s_knn, arf, lb]
name_list = ['KNN', 'HoeffdingTree', 'NaiveBayes', 'KNN+WeightedVote',
             'KNN+Standardize', 'AdaptiveRandomForest', 'Leverage Bagging']
# Evaluate each model in turn, then report per-class recall from its confusion matrix
for index in range(len(model_list)):
    evaluator.evaluate(stream=stream, model=[model_list[index]], model_names=[name_list[index]])
    cm = evaluator.get_mean_measurements(0).confusion_matrix
    # Per-class recall: diagonal entry over column sum (true-class total)
    for i in stream.target_values:
        recall = cm.data[(i, i)] / cm.sum_col[i] \
            if cm.sum_col[i] != 0 else 'Ill-defined'
        print("Class {}: {}".format(i, recall))