Example no. 1
def demo():
    """ _test_streams
    
    This demo tests if the streams are correctly generating samples.
    
    :return: 
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/covtype.csv")

    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_seed=32523423, instance_seed=5435,
                                        n_classes=2, n_features=10, num_drift_centroids=50)

    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
Example no. 2
def demo(instances=2000):
    """ _test_comparison_prequential
    
    This demo will test a prequential evaluation when more than one learner is 
    passed, which makes it a comparison task.
    
    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
     
    """
    # Stream setup
    stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    stream.prepare_for_use()
    # Setup the classifier
    clf = SGDClassifier()
    # classifier = KNNAdwin(n_neighbors=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    clf_one = KNNAdwin(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(base_estimator=KNN(), n_estimators=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                     34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                                     44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                                 ]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                        [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=2000,
        output_file='test_comparison_prequential.csv',
        max_samples=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        show_plot=True,
        metrics=['performance', 'kappa_t'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example no. 3
def demo():
    """ _test_knn
    
    This demo tests the KNNClassifier on a file stream, which gives
    instances coming from a SEA generator. 
    
    The test computes the performance of the KNNClassifier as well as
    the time to create the structure and classify max_samples (5000 by 
    default) instances.
    
    """
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_big.csv")

    train = 200
    X, y = stream.next_sample(train)
    # t = OneHotToCategorical([[10, 11, 12, 13],
    #                         [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                          36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])
    # t2 = OneHotToCategorical([[10, 11, 12, 13],
    #                         [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                          36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])
    start = timer()
    knn = KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNNClassifier', knn)])

    # compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean')

    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNNClassifier', compare)])

    # pipe.fit(X, y)
    # pipe2.fit(X, y)
    knn.partial_fit(X, y)
    # compare.fit(X, y)

    n_samples = 0
    max_samples = 5000
    my_corrects = 0
    # compare_corrects = 0

    while n_samples < max_samples:
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # compare_pred = pipe2.predict(X)
        # compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        # if y[0] == compare_pred[0]:
        #     compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end-start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects/n_samples))
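Run on its own, the demo above needs imports along these lines (a sketch; timer is taken to be timeit.default_timer, which matches the elapsed-seconds arithmetic):

from timeit import default_timer as timer

from skmultiflow.data import FileStream
from skmultiflow.lazy import KNNClassifier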
Example no. 4
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the movingSquares.csv file, inside the datasets 
    folder.

    Then we need to set up a classifier, which in this case is an instance
    of scikit-multiflow's SAMKNN. Then, optionally, we create a
    pipeline structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    stream = FileStream("../data/datasets/movingSquares.csv", -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(n_neighbors=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(base_estimator=KNN(n_neighbors=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = SAMKNN(n_neighbors=5,
                        weighting='distance',
                        max_window_size=1000,
                        stm_size_option='maxACCApprox',
                        use_ltm=False)
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=0,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=100,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=True,
                                    metrics=['performance'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example no. 5
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    # Setup Stream
    stream = FileStream("../data/datasets/" + filename)
    stream.prepare_for_use()

    # For each classifier, e...
    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_parametrized.csv', max_samples=10000,
                                    batch_size=1, n_wait=500, show_plot=show_plot)
    evaluator.evaluate(stream=stream, model=h, model_names=model_names)
Example no. 6
def demo():
    """ _test_mol

    This demo tests the MOL learner on a file stream, which reads from 
    the music.csv file.

    The test computes the performance of the MOL learner as well as 
    the time to create the structure and classify all the samples in 
    the file.

    """
    # Setup logging
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Setup the file stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/music.csv", 0, 6)

    # Setup the classifier; by default it uses Logistic Regression
    # classifier = MultiOutputLearner()
    # classifier = MultiOutputLearner(base_estimator=SGDClassifier(n_iter=100))
    classifier = MultiOutputLearner(base_estimator=Perceptron())

    # Setup the pipeline
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pretraining on %s samples', str(pretrain_size))
    logging.info('Total %s samples', str(stream.n_samples))
    X, y = stream.next_sample(pretrain_size)
    # classifier.fit(X, y)
    classes = stream.target_values
    classes_flat = list(set([item for sublist in classes for item in sublist]))
    pipe.partial_fit(X, y, classes=classes_flat)
    count = 0
    true_labels = []
    predicts = []
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # p = classifier.predict(X)
        p = pipe.predict(X)
        predicts.extend(p)
        true_labels.extend(y)
        count += 1
    perf = hamming_score(true_labels, predicts)
    logging.info('Evaluation time: %s s', str(timer() - init_time))
    logging.info('Total samples analyzed: %s', str(count))
    logging.info('The classifier\'s static Hamming score: %0.3f' % perf)
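A sketch of the imports this multi-output demo relies on; hamming_score is assumed to come from skmultiflow.metrics, and Perceptron from scikit-learn:

import logging
from timeit import default_timer as timer

from sklearn.linear_model import Perceptron

from skmultiflow.core import Pipeline
from skmultiflow.data import FileStream
from skmultiflow.meta import MultiOutputLearner
from skmultiflow.metrics import hamming_score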
Example no. 7
def demo(output_file=None, instances=50000):
    """ _test_sam_knn_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the moving_squares.csv file, inside the datasets 
    folder.

    Then we need to set up a classifier, which in this case is an instance
    of scikit-multiflow's SAMKNNClassifier. Then, optionally, we create a
    pipeline structure, initialized on that classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    stream = FileStream("../data/datasets/moving_squares.csv")
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = SAMKNNClassifier(n_neighbors=5,
                                  weighting='distance',
                                  max_window_size=1000,
                                  stm_size_option='maxACCApprox',
                                  use_ltm=False)

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=0,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=100,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=True)

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example no. 8
def demo():

    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")
    stream.prepare_for_use()

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
Example no. 9
def demo_parameterized(h, filename="covtype.csv", show_plot=True, model_names=None):
    # Setup Stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/" + filename)

    # For each classifier, e...
    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain, output_file='test_parametrized.csv', max_samples=10000,
                                    batch_size=1, n_wait=500, show_plot=show_plot)
    evaluator.evaluate(stream=stream, model=h, model_names=model_names)
Example no. 10
def demo():
    """ _test_streams
    
    This demo tests if the streams are correctly generating samples.
    
    :return: 
    """
    stream = FileStream('../data/datasets/covtype.csv', -1, 1)

    rbf_drift = RandomRBFGeneratorDrift(change_speed=41.00,
                                        n_centroids=50,
                                        model_seed=32523423,
                                        instance_seed=5435,
                                        n_classes=2,
                                        n_features=10,
                                        num_drift_centroids=50)

    sea = SEAGenerator()

    print('1 instance:\n')

    X, y = stream.next_sample()
    print(X)
    print(y)

    X, y = sea.next_sample()
    print(X)
    print(y)

    print('\n\n10 instances:\n')
    X, y = stream.next_sample(10)
    print(X)
    print(y)

    X, y = sea.next_sample(10)
    print(X)
    print(y)
Example no. 11
def demo():

    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/sea_stream.csv")

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
Example no. 12
import sys
sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/clef_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=5, weight_inv=0.01)
sknn = SKNNClassifier(k=200, sample_size=500, sample_recent=True,
                      similarity='cosine', sliding_window=True)
popular = PopularClassifier(sliding_window=True)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn])

evaluator = EvaluatePrequential(session_column_index=0,
                                time_column_index=1,
                                rec_size=10,
                                allow_reminders=True)
Example no. 13
stream2 = RandomRBFGeneratorDrift(model_random_state=99, sample_random_state=50,
                                  n_classes=2, n_features=10,
                                  n_centroids=10000, change_speed=10)
X, y = stream2.next_sample(10000)
stream2.restart()
df2 = pd.DataFrame(np.hstack((X, np.array([y]).T)))
df2.to_csv("RBF Dataset 10.csv")

stream3 = RandomRBFGeneratorDrift(model_random_state=99, sample_random_state=50,
                                  n_classes=2, n_features=10,
                                  n_centroids=10000, change_speed=70)
X, y = stream3.next_sample(10000)
stream3.restart()
df3 = pd.DataFrame(np.hstack((X, np.array([y]).T)))
df3.to_csv("RBF Dataset 70.csv")

# Single online classifiers
stream10 = FileStream("./" + 'RBF Dataset 10' + '.csv')
stream70 = FileStream("./" + 'RBF Dataset 70' + '.csv')
stream = FileStream("./" + 'RBF Dataset' + '.csv')

MLP = MLPClassifier(hidden_layer_sizes=(200, 200, 200, 200), random_state=1, max_iter=500)
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()

evaluator = EvaluatePrequential(max_samples=10000,
                                max_time=1000,
                                show_plot=True,
                                pretrain_size=3000,
                                metrics=['accuracy'])

# Online ensembles
stream10 = FileStream("./" + 'RBF Dataset 10' + '.csv')
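The snippet stops after re-creating stream10; a plausible next step is to evaluate the single classifiers set up above (the model list and names are assumptions):

evaluator.evaluate(stream=stream10, model=[MLP, nb, ht],
                   model_names=['MLP', 'NB', 'HT'])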
Example no. 14
from skmultiflow.data import FileStream
from skmultiflow.lazy.knn import KNN
from skmultiflow.evaluation import EvaluatePrequential

n_neighbors = 8
max_window_size = 2000
leaf_size = 30
n_estimators = 30
show_plot = True
pretrain_size = 100
max_samples = 7000
metrics = ['accuracy']

stream = FileStream('data/stream1.csv')
stream.prepare_for_use()
mdl = KNN(n_neighbors=n_neighbors,
          max_window_size=max_window_size,
          leaf_size=leaf_size)
evaluator = EvaluatePrequential(show_plot=show_plot,
                                pretrain_size=pretrain_size,
                                max_samples=max_samples,
                                metrics=metrics)
evaluator.evaluate(stream=stream, model=mdl)
Example no. 15
def demo(instances=2000):
    """ _test_comparison_prequential
    
    This demo will test a prequential evaluation when more than one learner is 
    passed, which makes it a comparison task.
    
    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
     
    """
    # Stream setup
    stream = FileStream(
        "https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
        "master/covtype.csv")
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    # Setup the classifier
    clf = SGDClassifier()
    # classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000,leaf_size=40, nominal_attributes=None)
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                                     leaf_size=30))
    clf_one = KNNADWINClassifier(n_neighbors=8,
                                 max_window_size=1000,
                                 leaf_size=30)
    # clf_two = KNNClassifier(n_neighbors=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeveragingBaggingClassifier(base_estimator=KNNClassifier(), n_estimators=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                     34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                                     44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                                 ]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                             [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
    #                              27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    #                              40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one),
                         ('KNNClassifier', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNNClassifier', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=2000,
        output_file='test_comparison_prequential.csv',
        max_samples=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        show_plot=True)

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example no. 16
import sys
sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/yoochoose_clicks_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=3, weight_inv=0.9)
sknn = SKNNClassifier(k=300, sample_size=1500, sample_recent=True,
                      similarity='cosine', sliding_window=True)
popular = PopularClassifier(sliding_window=False)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn], boundaries=[0.5])

evaluator = EvaluatePrequential(session_column_index=0,
                                time_column_index=1,
                                rec_size=10,
                                allow_reminders=True)
Example no. 17
    def transform_vector(self, X):
        r, c = get_dimensions(X)
        for i in range(r):
            row = np.copy([X[i][:]])
            for j in range(c):
                value = X[i][j]
                mean = self.calculate_mean(j)
                standard_deviation = self.calculate_stddev(j)
                standardized = (value - mean) / standard_deviation
                X[i][j] = standardized
            self.window.add_element(row)
        return X
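transform_vector relies on helpers that the fragment does not show. A minimal sketch of what they might look like, assuming self.window buffers the rows added via add_element and exposes them through a get_queue() method (the helper bodies and get_queue are assumptions):

    def calculate_mean(self, column_index):
        # Mean of one feature over the rows currently buffered in the window (assumed API).
        buffered = np.concatenate(self.window.get_queue(), axis=0)
        return np.mean(buffered[:, column_index])

    def calculate_stddev(self, column_index):
        # Standard deviation of the same buffered rows (assumed API).
        buffered = np.concatenate(self.window.get_queue(), axis=0)
        return np.std(buffered[:, column_index])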

# Read the stream
stream = FileStream("C:/Users/jeffr/OneDrive/Desktop/Data Stream/Assignment_One/dataset/data_n30000.csv")
stream.prepare_for_use()

#stream.next_sample(10)
#stream.n_remaining_samples()
#X, y = stream.next_sample(5000)

metrics = ['accuracy', 'kappa', 'kappa_m', 'kappa_t', 'running_time', 'model_size']
evaluator = EvaluatePrequential(max_samples=30000, n_wait=100, show_plot=True, metrics=metrics)

my_knn = MyKNNClassifier(standardize=True, weighted_vote=False)
evaluator.evaluate(stream=stream, model=[my_knn], model_names=['My_KNN'])
cm = evaluator.get_mean_measurements(0).confusion_matrix
print("Recall per class")
for i in range(cm.n_classes):
    recall = cm.data[(i,i)]/cm.sum_col[i] \
        if cm.sum_col[i] != 0 else 'Ill-defined'
    print("Class {}: {}".format(i, recall))
Example no. 18
evaluator = EvaluatePrequential(max_samples=30000,
                                n_wait=100,
                                show_plot=False,
                                metrics=metrics)
evaluator.evaluate(stream=stream, model=[knn], model_names=['KNN'])
cm = evaluator.get_mean_measurements(0).confusion_matrix
print("Recall per class")
# Recall = True Positive / (True Positive + False Negative)
for i in range(cm.n_classes):
    recall = cm.data[(i,i)]/cm.sum_col[i] \
    if cm.sum_col[i] != 0 else 'Ill-defined'
    print("Class {}: {}".format(i, recall))
#------------------------------------------------Experiment 3--------------------------------------------------------------- 
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.meta import LeveragingBaggingClassifier
# Read in stream
stream = FileStream(r"C:\Users\luyj0\OneDrive\Desktop\COMPX523-Data Stream Mining\covtype_numeric.csv")
# Set up different classifiers
knn = MyKNNClassifier()
ht = HoeffdingTreeClassifier()
nb = NaiveBayes()
wv_knn = MyKNNClassifier(weighted_vote=True)
s_knn = MyKNNClassifier(standardize=True)
arf = AdaptiveRandomForestClassifier()
lb = LeveragingBaggingClassifier()
# Set up two ensemble algorithms
metrics = ['accuracy', 'kappa', 'kappa_m','kappa_t', 'running_time', 'model_size']
# use a test-then-train evaluation approach
evaluator = EvaluatePrequential(max_samples=30000,
                                n_wait=100,
                                show_plot=False,
                                metrics=metrics)
Example no. 19
import sys

sys.path.append("..")
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier
from recommendation.co_events import CoEventsClassifier
from recommendation.seq_events import SeqEventsClassifier
from recommendation.ht_wrapper import HTWrapper
from recommendation.beer import BeerEnsemble
from recommendation.sknn import SKNNClassifier

# Create stream
stream = FileStream("../data/trivago_1M100K.csv")
stream.prepare_for_use()

# Instantiate recommenders
random = RandomClassifier()
ht = HTWrapper(weight_mc=5, weight_inv=0.90)
sknn = SKNNClassifier(k=100,
                      sample_size=1000,
                      sample_recent=True,
                      similarity='cosine',
                      sliding_window=True)
popular = PopularClassifier(sliding_window=True)
ar = CoEventsClassifier(sliding_window=False)
sr = SeqEventsClassifier(sliding_window=False)
mc = SeqEventsClassifier(steps_back=1, sliding_window=False)
beer = BeerEnsemble(cf_components=[ar, sr, mc, popular, sknn])
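The snippet builds the ensemble but never evaluates it; a plausible continuation, mirroring Example no. 24 (the parameter values, model list, and names are assumptions):

evaluator = EvaluatePrequential(session_column_index=0,
                                time_column_index=1,
                                rec_size=10,
                                pretrain_size=0,
                                max_samples=100000,
                                metrics=['recall', 'mrr', 'running_time'])
evaluator.evaluate(stream=stream,
                   model=[random, ht, sknn, popular, ar, sr, mc, beer],
                   model_names=['RND', 'HT', 'SKNN', 'POP', 'AR', 'SR', 'MC', 'BEER'])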
Example no. 20
def demo():
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives 
    instances coming from a SEA generator. 
    
    The test computes the performance of the KNNAdwin classifier as well as 
    the time to create the structure and classify max_samples (10000 by 
    default) instances.
    
    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423,
    #                                  sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50)
    stream.prepare_for_use()
    t = OneHotToCategorical([[10, 11, 12, 13],
                             [
                                 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                 47, 48, 49, 50, 51, 52, 53
                             ]])
    t2 = OneHotToCategorical([[10, 11, 12, 13],
                              [
                                  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                  25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                  36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                  47, 48, 49, 50, 51, 52, 53
                              ]])

    # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe2.fit(X, y)

        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False
    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples / 20) == 0:
            logging.info('%s%%', str((n_samples // (max_samples / 20) * 5)))
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # my_pred = [1]
        if first:
            # pipe.partial_fit(X, y, classes=stream.target_values)
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            # pipe.partial_fit(X, y)
            knn.partial_fit(X, y)
        # compare_pred = pipe2.predict(X)
        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
Example no. 21
def make_stream(path, classifier):
    stream = FileStream(path)
    evaluator = flow_detection_classifier(classifier, stream)
    stream = evaluator.stream.y
    return stream
Example no. 22
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]),
                                            np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20)) * 5) + "%")
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100 - 100 * accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {
        'maxSize': 1000,
        'nNeighbours': 5,
        'knnWeights': 'distance',
        'STMSizeAdaption': 'maxACCApprox',
        'use_ltm': False
    }
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance', 'STMSizeAdaption': None,
    #               'use_ltm': False}

    logging.info('loading dataset')
    # stream = FileStream("../data/datasets/weather.csv")
    stream = FileStream("../data/datasets/moving_squares.csv")
    stream.prepare_for_use()

    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])
    run(X[:], y[:], hyperParams)
Example no. 23
    true_labels = []
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20))*5) + "%")
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100-100*accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'use_ltm': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance', 'STMSizeAdaption': None,
    #               'use_ltm': False}

    logging.info('loading dataset')
    # stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
    #                     "master/weather.csv")
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/moving_squares.csv")

    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])
    run(X[:], y[:], hyperParams)
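Examples no. 22, 23, and 27 all end with a call to run(X, y, hyperParams) whose definition is not shown. A minimal sketch reconstructed from the loop bodies above; the mapping of the hyperparameter keys onto SAMKNNClassifier is an assumption, and imports (numpy as np, logging, sklearn.metrics.accuracy_score, skmultiflow's SAMKNNClassifier) are assumed to be in scope:

def run(X, y, hyperParams):
    # Assumed mapping of the dict keys onto the SAMKNNClassifier constructor.
    classifier = SAMKNNClassifier(n_neighbors=hyperParams['nNeighbours'],
                                  weighting=hyperParams['knnWeights'],
                                  max_window_size=hyperParams['maxSize'],
                                  stm_size_option=hyperParams['STMSizeAdaption'],
                                  use_ltm=hyperParams['use_ltm'])
    r = X.shape[0]
    predicted_labels = []
    true_labels = []
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20)) * 5) + "%")
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100 - 100 * accuracy))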

Example no. 24
# Example 1: popularity recommender
from skmultiflow.data import FileStream
from evaluation.evaluate_prequential import EvaluatePrequential
from recommendation.random import RandomClassifier
from recommendation.popular import PopularClassifier

# Create stream
stream = FileStream("your-dataset.csv")
stream.prepare_for_use()

# Instantiate recommender
popular = PopularClassifier(sliding_window=True)

# Configure evaluator
evaluator = EvaluatePrequential(session_column_index=0,
                                rec_size=10,
                                pretrain_size=0,
                                n_wait=200,     # evaluation window
                                n_keep=20000,   # observation window
                                max_samples=100000,
                                metrics=['recall', 'mrr', 'running_time'])

# Run evaluation
evaluator.evaluate(stream=stream, model=[popular], model_names=['POP'])
Example no. 25
from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.trees import HoeffdingTreeRegressor

from streaming_random_patches_regressor import StreamingRandomPatchesRegressor


###############################################################################
#                                    Options                                  #
###############################################################################
SEED = 123456
n_estimators = 3
aggregation_method = 'median'            # 'median', 'mean'
drift_detection_criteria = 'prediction'  # 'error', 'prediction'
subspace_mode = "randompatches"          # "randomsubspaces", "resampling", "randompatches"
###############################################################################

stream = FileStream('datasets/cal_housing.csv')

SRPR = StreamingRandomPatchesRegressor(n_estimators=n_estimators,
                                       aggregation_method=aggregation_method,
                                       random_state=SEED)
HTR = HoeffdingTreeRegressor(random_state=SEED)  # , leaf_prediction='mean')

evaluator = EvaluatePrequential(pretrain_size=0,
                                show_plot=True,
                                metrics=['mean_square_error',
                                         'mean_absolute_error',
                                         'true_vs_predicted']
                                )

evaluator.evaluate(stream=stream, model=[SRPR, HTR], model_names=['SRP-Reg', 'HT-Reg'])
Example no. 26
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-d',
                    '--dataset',
                    required=False,
                    default="sea_gen",
                    help="Name of the dataset (CSV file under datasets/)")
parser.add_argument('-s',
                    '--label_size',
                    required=False,
                    default=0.25,
                    help="Fraction of labeled instances")

args = parser.parse_args()

test_dataset = args.dataset
print("dataset:" + "datasets/" + test_dataset + '.csv')
stream = FileStream("datasets/" + test_dataset + '.csv')
#print(stream.get_target_values())

onlineBoosting = OnlineBoostingClassifier()
knn_adwin = KNNADWINClassifier(n_neighbors=8,
                               leaf_size=40,
                               max_window_size=1000)
SAMKNN = SAMKNNClassifier(n_neighbors=10,
                          weighting='distance',
                          max_window_size=500,
                          stm_size_option='maxACCApprox',
                          use_ltm=False)
learn_pp_nse = LearnPPNSEClassifier()
SGD = SGDClassifier()
rslvq = RobustSoftLearningVectorQuantization()
#CMMM2 = CMGMMClassifier(classes=stream.get_target_values(), prune_component=True, drift_detector=None)
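The fragment sets up the detectors and classifiers but stops before any evaluation; a minimal continuation under the same API might look like this (the evaluator settings, model list, and names are assumptions):

from skmultiflow.evaluation import EvaluatePrequential

evaluator = EvaluatePrequential(max_samples=10000,
                                n_wait=200,
                                show_plot=False,
                                metrics=['accuracy'])
evaluator.evaluate(stream=stream,
                   model=[onlineBoosting, knn_adwin, SAMKNN, learn_pp_nse, SGD, rslvq],
                   model_names=['OB', 'KNN-ADWIN', 'SAMKNN', 'LearnPP-NSE', 'SGD', 'RSLVQ'])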
Example no. 27
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]), np.asarray([y[i]]), None)
        if (i % (r // 20)) == 0:
            logging.info(str((i // (r / 20))*5) + "%")
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100-100*accuracy))


if __name__ == '__main__':
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    hyperParams = {'maxSize': 1000, 'nNeighbours': 5, 'knnWeights': 'distance',
                   'STMSizeAdaption': 'maxACCApprox', 'useLTM': False}
    # hyperParams = {'windowSize': 5000, 'nNeighbours': 5, 'knnWeights': 'distance', 'STMSizeAdaption': None,
    #               'useLTM': False}

    logging.info('loading dataset')
    # stream = FileStream("../data/datasets/weather.csv")
    stream = FileStream("../data/datasets/moving_squares.csv")
    stream = FileStream("/Users/jing/local/scikit-multiflow/src/skmultiflow/data/datasets/covtype.csv")

    stream.prepare_for_use()

    X, y = stream.next_sample(stream.n_samples)

    logging.info('%d samples' % X.shape[0])
    logging.info('%d dimensions' % X.shape[1])
    run(X[:], y[:], hyperParams)

Example no. 28
def test_evaluate_and_adapt_trees():

    expected_accuracies = [
        0.86, 0.876, 0.914, 0.858, 0.77, 0.894, 0.876, 0.91, 0.898, 0.884,
        0.804, 0.808
    ]

    expected_trees = [30, 60, 30, 60, 30]

    # Load the meta-model
    dictMeta = {
        0.0: 60,
        0.1: 30,
        0.2: 30,
        0.3: 30,
        0.4: 60,
        0.5: 70,
        0.6: 60,
        0.7: 30,
        0.8: 30,
        0.9: 30
    }  # dict = {'pourc redund feat':best nb tree}

    n_trees = 10
    n_samples_max = 6000
    n_samples_meas = 500

    stream = FileStream('./recurrent-data/real-world/elec.csv')

    stream.prepare_for_use()

    # Evaluate model (with adaptation or not)
    arf = AdaptiveRandomForest(n_estimators=n_trees,
                               lambda_value=6,
                               grace_period=10,
                               split_confidence=0.1,
                               tie_threshold=0.005,
                               warning_detection_method=ADWIN(delta=0.01),
                               drift_detection_method=ADWIN(delta=0.001),
                               random_state=0)

    modelsList = [arf]
    modelsNames = ['ARF']

    evaluator = EvaluatePrequentialAndAdaptTreesARF(
        metrics=['accuracy', 'kappa', 'running_time', 'ram_hours'],
        show_plot=False,
        n_wait=n_samples_meas,
        pretrain_size=200,
        max_samples=n_samples_max,
        output_file=None,
        metaKB=dictMeta)

    # Run evaluation
    model, acc, n_trees = evaluator.evaluate(stream=stream,
                                             model=modelsList,
                                             model_names=modelsNames)

    assert np.alltrue(acc[0] == expected_accuracies)

    assert np.alltrue(n_trees[0] == expected_trees)
Example no. 29
from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.bayes import NaiveBayes
from skmultiflow.meta import OzaBagging

from sklearn.datasets import make_classification

with open("dataset_imb.csv", "w") as f:
    X, y = make_classification(
        n_features=10, n_informative=10, n_redundant=0, n_samples=10000, weights=[0.5]
    )
    for i in range(X.shape[0]):
        for att in X[i]:
            f.write(str(att) + ",")
        f.write(str(y[i]) + "\n")


generator = FileStream("dataset_imb.csv")

dpdes = DPDESMethod(NaiveBayes(), 200, 10, KNORAU())
ozabag = OzaBagging(NaiveBayes(), n_estimators=10)

evaluator = EvaluatePrequential(
    max_samples=10000,
    n_wait=200,
    batch_size=200,
    pretrain_size=0,
    metrics=["precision"],
)
evaluator.evaluate(generator, [dpdes, ozabag], ["DPDES", "Ozabag"])
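The snippet assumes DPDESMethod and KNORAU are already importable; KNORAU ships with the deslib package, while DPDESMethod is presumably project-local (the module path below is hypothetical):

from deslib.des import KNORAU  # dynamic ensemble selection, from deslib
from dpdes import DPDESMethod  # hypothetical local module providing DPDESMethod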
Example no. 30
def demo():
    """ _test_kdtree_compare
    
    This demo compares creation and query speed for different kd tree 
    implementations. They are fed with instances from the covtype dataset. 
    
    Three kd tree implementations are compared: SciPy's KDTree, 
    scikit-multiflow's KDTree and scikit-learn's KDTree. For each of them 
    the demo will time the construction of the tree on 1000 instances, and 
    then measure the time to query 100 instances. The results are displayed 
    in the terminal.
    
    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)

    filter = OneHotToCategorical([[10, 11, 12, 13],
                                  [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                   34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    X, y = stream.next_sample(1000)
    X = filter.transform(X)
    # print(X)

    X_find, y = stream.next_sample(100)
    X_find = filter.transform(X_find)
    print(X_find[4])
    # Normal kdtree
    start = timer()
    scipy = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nScipy KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(100):
        ind = scipy.query(X_find[i], 8)
        # print(ind)
    end = timer()
    print("Scipy KDTree query time: " + str(end - start))

    del scipy

    # Fast kdtree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nOptimal KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(100):
        ind, dist = opt.query(X_find[i], 8)
        # print(ind)
        # print(dist)
    end = timer()
    print("Optimal KDTree query time: " + str(end - start))

    del opt

    # Sklearn kdtree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nSklearn KDTree construction time: " + str(end-start))

    start = timer()
    for i in range(100):
        ind, dist = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True)
        # print(ind)
        # print(dist)
    end = timer()
    print("Sklearn KDTree query time: " + str(end - start) + "\n")

    del sk