Python Pipeline Examples, skmultiflow.core.pipeline.Pipeline Python Examples

Example #1

0

Show file

def demo():
    """ _test_mol

    This demo tests the MOL learner on a file stream, which reads from 
    the music.csv file.

    The test computes the performance of the MOL learner as well as 
    the time to create the structure and classify all the samples in 
    the file.

    """
    # Setup logging
    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Setup the file stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False)
    stream = FileStream(opt, 0, 6)
    stream.prepare_for_use()

    # Setup the classifier, by default it uses Logistic Regression
    #classifier = MultiOutputLearner()
    #classifier = MultiOutputLearner(h=SGDClassifier(n_iter=100))
    classifier = MultiOutputLearner(h=Perceptron())

    # Setup the pipeline
    pipe = Pipeline([('classifier', classifier)])

    pretrain_size = 150
    logging.info('Pre training on %s samples', str(pretrain_size))
    X, y = stream.next_instance(pretrain_size)
    #classifier.fit(X, y)
    pipe.partial_fit(X, y, classes=stream.get_classes())
    count = 0
    true_labels = []
    predicts = []
    init_time = timer()
    logging.info('Evaluating...')
    while stream.has_more_instances():
        X, y = stream.next_instance()
        #p = classifier.predict(X)
        p = pipe.predict(X)
        predicts.extend(p)
        true_labels.extend(y)
        count += 1
    perf = hamming_score(true_labels, predicts)
    logging.info('Evaluation time: %s s', str(timer() - init_time))
    logging.info('Total samples analyzed: %s', str(count))
    logging.info('The classifier\'s static Hamming score    : %0.3f' % perf)

Example #2

0

Show file

File: _test_mtr.py Project: lengfab/scikit-multiflow

def demo(input_file, output_file=None):
    """ _test_mtr_regression

    This demo demonstrates how to evaluate a Multi-Target Regressor. The
    employed dataset is 'scm1d', which is contained in the data folder.

    Parameters
    ----------
    input_file: string
        A string describind the path for the input dataset

    output_file: string
        The name of the csv output file

    """
    stream = RegressionGenerator(n_samples=5000, n_features=20,
                                 n_informative=15, random_state=1,
                                 n_targets=7)
    stream.prepare_for_use()

    classifier = MultiTargetRegressionHoeffdingTree(leaf_prediction='adaptive')

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=1, batch_size=1, n_wait=200,
                                    max_time=1000, output_file=output_file,
                                    show_plot=False,
                                    metrics=['average_mean_square_error',
                                             'average_mean_absolute_error',
                                             'average_root_mean_square_error'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

Example #3

0

Show file

File: _test_comparison_prequential.py Project: lfleck/scikit-multiflow

def demo(instances=2000):
    """ _test_comparison_prequential
    
    This demo will test a prequential evaluation when more than one learner is 
    passed, which makes it a comparison task.
    
    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
     
    """
    # Stream setup
    stream = FileStream("../datasets/covtype.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    stream.prepare_for_use()
    # Setup the classifier
    clf = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    clf_one = KNNAdwin(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(h=KNN(), ensemble_length=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                     34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                                     44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                                 ]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                        [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='teste.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True,
                                    metrics=['performance', 'kappa_t'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)

Example #4

0

Show file

File: _test_prequential.py Project: mkobbi/scikit-multiflow

def demo(output_file=None, instances=40000):
    """ _test_prequential
    
    This demo shows how to produce a prequential evaluation.
    
    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the sea_big.csv file, inside the datasets 
    folder.
    
    Then we need to setup a classifier, which in this case is an instance 
    of sklearn's PassiveAggressiveClassifier. Then, optionally we create a 
    pipeline structure, initialized on that classifier.
    
    The evaluation is then run.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV",
                     False)
    stream = FileStream(opt, -1, 1)
    #stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    #classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(
        pretrain_size=200,
        max_instances=instances,
        batch_size=1,
        n_wait=100,
        max_time=1000,
        output_file=output_file,
        task_type='classification',
        show_plot=True,
        plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #5

0

Show file

File: _test_prequential.py Project: houcembenmakhlouf/GHVFDT

def demo(output_file=None, instances=40000):
    """ _test_prequential
    
    This demo shows how to produce a prequential evaluation.
    
    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the sea_big.csv file.
    
    Then we need to setup a classifier, which in this case is an instance 
    of sklearn's PassiveAggressiveClassifier. Then, optionally we create a 
    pipeline structure, initialized on that classifier.
    
    The evaluation is then run.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    # stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
    #                     "master/sea_big.csv")
    # stream = WaveformGenerator()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000,leaf_size=40, nominal_attributes=None)
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                        leaf_size=30))
    classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=200,
        max_samples=instances,
        batch_size=1,
        n_wait=100,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

Example #6

0

Show file

def test_batch_incremental():
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()
    estimator = DecisionTreeClassifier(random_state=112)
    classifier = BatchIncremental(base_estimator=estimator, n_estimators=10)

    learner = Pipeline([('classifier', classifier)])

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    performance = correct_predictions / len(predictions)
    expected_predictions = [1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,
                            0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,
                            0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                            0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]

    expected_correct_predictions = 31
    expected_performance = 0.6326530612244898

    assert np.alltrue(predictions == expected_predictions)
    assert np.isclose(expected_performance, performance)
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray

Example #7

0

Show file

File: _test_holdout.py Project: abifet/scikit-multiflow

def demo(output_file=None, instances=40000):
    """ _test_holdout
    
    This demo runs a holdout evaluation task with one learner. The default 
    stream is a WaveformGenerator. The default learner is a SGDClassifier, 
    which is inserted into a Pipeline structure. All the default values can 
    be changing by uncommenting/commenting the code below.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
         
    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    #stream = FileStream(opt, -1, 1)
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluateHoldout(pretrain_size=10000,
                           test_size=2000,
                           dynamic_test_set=True,
                           max_instances=instances,
                           batch_size=1,
                           n_wait=15000,
                           max_time=1000,
                           output_file=output_file,
                           task_type='classification',
                           show_plot=True,
                           plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #8

0

Show file

File: _test_prequential_mol.py Project: yupbank/scikit-multiflow

def demo(output_file=None, instances=40000):
    """ _test_prequential_mol

    This demo shows the evaluation process of a MOL classifier, initialized 
    with sklearn's SGDClassifier.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False)
    #stream = FileStream(opt, 0, 6)
    stream = MultilabelGenerator(n_samples=instances)
    #stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = MultiOutputLearner(SGDClassifier(n_iter=100))
    #classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(
        pretrain_size=5000,
        max_instances=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        task_type='multi_output',
        show_plot=True,
        plot_options=['hamming_score', 'j_index', 'exact_match'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #9

0

Show file

def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging
    
    This demo shows the evaluation process of a LeverageBagging classifier, 
    initialized with KNN classifiers.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    stream = SEAGenerator(classification_function=2,
                          instance_seed=755437,
                          noise_percentage=0.0)
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = LeverageBagging(h=KNN(k=8, max_window_size=2000,
                                       leaf_size=30),
                                 ensemble_length=1)

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(
        pretrain_size=2000,
        max_instances=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        task_type='classification',
        show_plot=True,
        plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #10

0

Show file

File: _test_regression.py Project: yupbank/scikit-multiflow

def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo demonstrates how to evaluate a regressor. The data stream used 
    is an instance of the RegressionGenerator, which feeds an instance from 
    sklearn's SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    #stream = FileStream(opt, -1, 1)
    #stream = WaveformGenerator()
    #stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)
    # Setup the classifier
    #classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(pretrain_size=1,
                               max_instances=instances,
                               batch_size=1,
                               n_wait=1,
                               max_time=1000,
                               output_file=output_file,
                               task_type='regression',
                               show_plot=True,
                               plot_options=['true_vs_predicts'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #11

0

Show file

def demo(output_file=None, instances=40000):
    """ _test_prequential_mol

    This demo shows the evaluation process of a MOL classifier, initialized 
    with sklearn's SGDClassifier.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    stream = MultilabelGenerator(n_samples=instances)
    # stream = WaveformGenerator()

    # Setup the classifier
    classifier = MultiOutputLearner(SGDClassifier(n_iter=100))
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=5000,
        max_samples=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['hamming_score', 'j_index', 'exact_match'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

Example #12

0

Show file

File: _test_regression.py Project: zhouyonglong/scikit-multiflow

def demo(output_file=None, instances=40000):
    """ _test_regression

    This demo demonstrates how to evaluate a regressor. The data stream used
    is an instance of the RegressionGenerator, which feeds an instance from
    sklearn's SGDRegressor.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    # stream = WaveformGenerator()
    # stream.prepare_for_use()
    stream = RegressionGenerator(n_samples=40000)
    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    classifier = RegressionHoeffdingTree()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=1,
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    output_file=output_file,
                                    show_plot=False,
                                    metrics=['mean_square_error'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

Example #13

0

Show file

def demo():
    """ _test_pipeline
    
    This demo demonstrates the Pipeline structure seemingly working as a 
    learner, while being passed as parameter to an EvaluatePrequential 
    object.
     
    """
    # # Setup the stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    # stream.prepare_for_use()
    # # If used for Hoeffding Trees then need to pass indices for Nominal attributes

    # Test with RandomTreeGenerator
    # stream = RandomTreeGenerator(n_classes=2, n_numerical_attributes=5)
    # stream.prepare_for_use()

    # Test with WaveformGenerator
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = PerceptronMask()
    #classifier = NaiveBayes()
    #classifier = PassiveAggressiveClassifier()
    classifier = HoeffdingTree()

    # Setup the pipeline
    pipe = Pipeline([('Hoeffding Tree', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(show_plot=True,
                               pretrain_size=1000,
                               max_instances=100000)

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)

Example #14

0

Show file

# Setup the File Stream
#opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
#opt = FileOption("FILE", "OPT_NAME", "../datasets/movingSquares.csv", "CSV", False)
opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV",
                 False)

stream = FileStream(opt, -1, 1)
stream.prepare_for_use()

# Setup the classifiers
clf_one = BernoulliNB()
clf_two = AdaptiveRandomForest()

# Setup the pipeline for clf_one
pipe = Pipeline([('classifier', clf_one)])

# Create the list to hold both classifiers
classifier = [pipe, clf_two]

# Setup the evaluator
eval = EvaluatePrequential(pretrain_size=200,
                           max_instances=100000,
                           batch_size=1,
                           max_time=1000,
                           output_file='comparison_Bernoulli_ADFH_Preq.csv',
                           task_type='classification',
                           show_plot=True,
                           plot_options=['kappa', 'kappa_t', 'performance'])

# Evaluate