Example #1
0
def main():
    """Build a nested pipeline, set a flat hyperparameter space, sample it, and verify the samples."""
    pipeline = Pipeline([
        ('step1', MultiplyByN()),
        ('step2', MultiplyByN()),
        Pipeline([
            Identity(),
            Identity(),
            PCA(n_components=4),
        ]),
    ])

    # The space is addressed with flat double-underscore keys into nested steps.
    pipeline.set_hyperparams_space({
        'step1__multiply_by': RandInt(42, 50),
        'step2__multiply_by': RandInt(-10, 0),
        'Pipeline__PCA__n_components': RandInt(2, 3)
    })

    # Draw one random sample from the space and apply it to the pipeline.
    pipeline.set_hyperparams(pipeline.get_hyperparams_space().rvs())

    flat = pipeline.get_hyperparams().to_flat_as_dict_primitive()
    assert 42 <= flat['step1__multiply_by'] <= 50
    assert -10 <= flat['step2__multiply_by'] <= 0
    assert flat['Pipeline__PCA__n_components'] in [2, 3]
    assert pipeline['Pipeline']['PCA'].get_wrapped_sklearn_predictor().n_components in [2, 3]
def main():
    """
    Compare a queued pipeline against a plain sequential pipeline.

    The queued pipeline processes batches of 10 items with 8 workers that have
    a max queue size of 10. Each task sleeps 0.02 seconds per data input, then
    multiplies it by 2.
    """
    sleep_time = 0.02
    queued = SequentialQueuedPipeline([
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ], n_workers_per_step=8, max_queue_size=10, batch_size=10)

    start = time.time()
    outputs_streaming = queued.transform(list(range(100)))
    time_queued_pipeline = time.time() - start
    print('SequentialQueuedPipeline')
    print('execution time: {} seconds'.format(time_queued_pipeline))

    # Same work, but processed sequentially for comparison.
    vanilla = Pipeline([
        Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ])

    start = time.time()
    outputs_vanilla = vanilla.transform(list(range(100)))
    time_vanilla_pipeline = time.time() - start

    print('VanillaPipeline')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
Example #3
0
def test_resumable_pipeline_fit_transform_should_save_all_fitted_pipeline_steps(
        tmpdir: LocalPath):
    """Fitting a resumable pipeline should persist every step up to and
    including the checkpoint, but not the steps after it."""
    p = ResumablePipeline(
        [(SOME_STEP_1, MultiplyByN(multiply_by=2)),
         (PIPELINE_2,
          ResumablePipeline([(SOME_STEP_2, MultiplyByN(multiply_by=4)),
                             (CHECKPOINT, DefaultCheckpoint()),
                             (SOME_STEP_3, MultiplyByN(multiply_by=6))]))],
        cache_folder=tmpdir)
    p.name = ROOT

    p, outputs = p.fit_transform(np.array(range(10)), np.array(range(10)))

    not_saved_paths = [create_some_step3_path(tmpdir)]
    saved_paths = [
        create_root_path(tmpdir),
        create_pipeline2_path(tmpdir),
        create_some_step1_path(tmpdir),
        create_some_step2_path(tmpdir),
        create_some_checkpoint_path(tmpdir)
    ]
    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
    # Fix: the loops previously reused `p` as the loop variable, shadowing the
    # fitted pipeline bound above and making later use of `p` error-prone.
    for saved_path in saved_paths:
        assert os.path.exists(saved_path)
    for not_saved_path in not_saved_paths:
        assert not os.path.exists(not_saved_path)
def test_automl_sequential_wrapper(tmpdir):
    """Random search over a three-step multiplication pipeline should find a
    model whose mean squared error is reasonably low."""
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })

    pipeline = Pipeline(
        [('multiplication_1', MultiplyByN()),
         ('multiplication_2', MultiplyByN()),
         ('multiplication_3', MultiplyByN())],
        cache_folder=tmpdir).set_hyperparams_space(space)

    auto_ml = RandomSearch(
        KFoldCrossValidationWrapper().set_step(pipeline),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        n_iter=10)

    # When
    auto_ml: AutoMLSequentialWrapper = auto_ml.fit(data_inputs, expected_outputs)
    best_model: Pipeline = auto_ml.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs) ** 2).mean()
    assert actual_mse < 20000
Example #5
0
def test_apply_on_pipeline_with_meta_step_and_positional_argument_should_call_method_on_each_steps():
    """`apply` should recurse into meta steps, reaching the wrapped step too."""
    p = Pipeline([OutputTransformerWrapper(MultiplyByN(1)), MultiplyByN(1)])

    p.apply('set_hyperparams', hyperparams=HyperparameterSamples({'multiply_by': 2}))

    assert p.get_hyperparams()['multiply_by'] == 2
    assert p['OutputTransformerWrapper'].wrapped.get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
Example #6
0
def test_apply_on_pipeline_with_positional_argument_should_call_method_on_each_steps():
    """`apply` should call the named method on the pipeline and on every step."""
    p = Pipeline([MultiplyByN(1), MultiplyByN(1)])

    p.apply('set_hyperparams', hyperparams=HyperparameterSamples({'multiply_by': 2}))

    assert p.get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN1'].get_hyperparams()['multiply_by'] == 2
Example #7
0
def test_queued_pipeline_with_n_workers_step():
    """Steps given as (n_workers, step) tuples should transform correctly."""
    worker_steps = [(1, MultiplyByN(2)) for _ in range(4)]
    queued = SequentialQueuedPipeline(worker_steps,
                                      batch_size=10,
                                      max_queue_size=5)

    results = queued.transform(list(range(100)))

    assert np.array_equal(results, EXPECTED_OUTPUTS)
Example #8
0
def test_apply_method_on_pipeline_should_call_method_on_each_steps():
    """`apply_method` should invoke the callable on the pipeline and each step."""
    p = Pipeline([MultiplyByN(1), MultiplyByN(1)])

    set_multiply_by_two = lambda step: step.set_hyperparams(
        HyperparameterSamples({'multiply_by': 2}))
    p.apply_method(set_multiply_by_two)

    assert p.get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN1'].get_hyperparams()['multiply_by'] == 2
Example #9
0
def test_queued_pipeline_with_step_name_n_worker_max_queue_size():
    """Steps given as (name, n_workers, max_queue_size, step) tuples should work."""
    named_steps = [(name, 1, 5, MultiplyByN(2)) for name in ('1', '2', '3', '4')]
    queued = SequentialQueuedPipeline(named_steps, batch_size=10)

    results = queued.transform(list(range(100)))

    assert np.array_equal(results, EXPECTED_OUTPUTS)
Example #10
0
def test_apply_method_on_pipeline_with_meta_step_should_call_method_on_each_steps():
    """`apply_method` should recurse into meta steps, reaching the wrapped step too."""
    p = Pipeline([OutputTransformerWrapper(MultiplyByN(1)), MultiplyByN(1)])

    set_multiply_by_two = lambda step: step.set_hyperparams(
        HyperparameterSamples({'multiply_by': 2}))
    p.apply_method(set_multiply_by_two)

    assert p.get_hyperparams()['multiply_by'] == 2
    assert p['OutputTransformerWrapper'].wrapped.get_hyperparams()['multiply_by'] == 2
    assert p['MultiplyByN'].get_hyperparams()['multiply_by'] == 2
Example #11
0
def test_parallel_queued_pipeline_with_step_name_n_worker_max_queue_size():
    """ParallelQueuedFeatureUnion should accept (name, n_workers, max_queue_size, step) tuples."""
    branches = [(name, 1, 5, MultiplyByN(2)) for name in ('1', '2', '3', '4')]
    union = ParallelQueuedFeatureUnion(branches, batch_size=10)

    results = union.transform(list(range(100)))

    assert np.array_equal(results, EXPECTED_OUTPUTS_PARALLEL)
Example #12
0
def main():
    """Transform with a pipeline, then mutate it so `transform` runs the
    inverse transformation instead."""
    pipeline = Pipeline([MultiplyByN(2), MultiplyByN(4)])

    forward_outputs = pipeline.transform(list(range(10)))
    print('transform: {}'.format(forward_outputs))

    # Reassign `inverse_transform` onto the `transform` slot.
    pipeline = pipeline.mutate(new_method='inverse_transform',
                               method_to_assign_to='transform')

    inverse_outputs = pipeline.transform(list(range(10)))
    print('inverse_transform: {}'.format(inverse_outputs))
Example #13
0
def test_queued_pipeline_with_excluded_incomplete_batch():
    """With include_incomplete_batch=False the trailing partial batch is dropped."""
    queued = SequentialQueuedPipeline(
        [MultiplyByN(2) for _ in range(4)],
        batch_size=10, include_incomplete_batch=False,
        n_workers_per_step=1, max_queue_size=5)

    results = queued.transform(list(range(15)))

    # Only the first full batch of 10 items survives; the 5 leftovers are excluded.
    assert np.array_equal(results, np.array(list(range(10))) * 2 * 2 * 2 * 2)
Example #14
0
def test_predict_should_predict_in_test_mode():
    """`predict` must run in test mode: TestOnly steps apply, TrainOnly steps don't."""
    fit_tape = TapeCallbackFunction()
    transform_tape = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(CallbackWrapper(MultiplyByN(2), transform_tape, fit_tape)),
        TrainOnlyWrapper(CallbackWrapper(MultiplyByN(4), transform_tape, fit_tape)),
    ])

    outputs = p.predict(np.array([1, 1]))

    # Only the test-only multiply-by-2 step ran.
    assert np.array_equal(outputs, np.array([2, 2]))
Example #15
0
def test_parallel_queued_parallelize_correctly():
    """A queued pipeline with 4 workers per step should beat the same work run sequentially."""
    sleep_time = 0.001

    def make_sleeping_step():
        # Fresh instance per slot: sleep per data input, then multiply by 2.
        return Pipeline([ForEachDataInput(Sleep(sleep_time=sleep_time)), MultiplyByN(2)])

    queued = SequentialQueuedPipeline(
        [(name, 4, 10, make_sleeping_step()) for name in ('1', '2', '3', '4')],
        batch_size=10)

    start = time.time()
    outputs_streaming = queued.transform(list(range(100)))
    time_queued_pipeline = time.time() - start

    vanilla = Pipeline([make_sleeping_step() for _ in range(4)])

    start = time.time()
    outputs_vanilla = vanilla.transform(list(range(100)))
    time_vanilla_pipeline = time.time() - start

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
Example #16
0
def test_predict_should_transform_with_initial_is_train_mode_after_predict():
    """After `predict`, the pipeline should be back in train mode, so a plain
    `transform` applies the TrainOnly step again."""
    fit_tape = TapeCallbackFunction()
    transform_tape = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(CallbackWrapper(MultiplyByN(2), transform_tape, fit_tape)),
        TrainOnlyWrapper(CallbackWrapper(MultiplyByN(4), transform_tape, fit_tape)),
    ])

    p.predict(np.array([1, 1]))
    outputs = p.transform(np.array([1, 1]))

    # In train mode only the train-only multiply-by-4 step ran.
    assert np.array_equal(outputs, np.array([4, 4]))
def test_flatten_for_each_should_transform_data_inputs_and_expected_outputs():
    """FlattenForEach should flatten both data inputs and expected outputs,
    transform them, and restore the original shape afterwards."""
    step = FlattenForEach(Pipeline([
        MultiplyByN(2),
        OutputTransformerWrapper(MultiplyByN(3)),
    ]))
    # TODO: should use a tape here and ensure that the MultiplyByN received a flat 12 shape only once and not 3*4 things
    data_inputs, expected_outputs = _create_random_of_shape(DATA_SHAPE)

    step, result = step.handle_fit_transform(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
        ExecutionContext())

    assert np.array(result.data_inputs).shape == DATA_SHAPE
    assert np.array_equal(result.data_inputs, data_inputs * 2)
    assert np.array(result.expected_outputs).shape == DATA_SHAPE
    assert np.array_equal(result.expected_outputs, expected_outputs * 3)
Example #18
0
def test_handle_predict_should_predict_in_test_mode():
    """`handle_predict` should run in test mode on a DataContainer."""
    fit_tape = TapeCallbackFunction()
    transform_tape = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(CallbackWrapper(MultiplyByN(2), transform_tape, fit_tape)),
        TrainOnlyWrapper(CallbackWrapper(MultiplyByN(4), transform_tape, fit_tape)),
    ])

    container = DataContainer(
        data_inputs=np.array([1, 1]), expected_outputs=np.array([1, 1]))
    result = p.handle_predict(data_container=container, context=ExecutionContext())

    # Only the test-only multiply-by-2 step ran.
    assert np.array_equal(result.data_inputs, np.array([2, 2]))
Example #19
0
def test_automl_early_stopping_callback(tmpdir):
    """AutoML with a metric callback should fit and yield a best model."""
    # TODO: fix this unit test
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 60
    pipeline = Pipeline([
        FitTransformCallbackStep().set_name('callback'),
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression(),
    ])
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml = auto_ml.fit(data_inputs=data_inputs,
                          expected_outputs=expected_outputs)

    # Then — no assertion yet; see the TODO above.
    p = auto_ml.get_best_model()
Example #20
0
def test_step_cloner_should_fit_transform():
    """StepClonerForEachDataInput should fit one pipeline clone per data input."""
    # Given
    tape = TapeCallbackFunction()
    cloner = StepClonerForEachDataInput(
        Pipeline([FitCallbackStep(tape), MultiplyByN(2)]))
    data_inputs = _create_data((2, 2))
    expected_outputs = _create_data((2, 2))

    # When
    cloner, processed_outputs = cloner.fit_transform(data_inputs, expected_outputs)

    # Then: each clone saw exactly its own (data_input, expected_output) pair.
    for i in range(2):
        clone = cloner.steps[i]
        assert isinstance(clone, Pipeline)
        assert np.array_equal(clone[0].callback_function.data[0][0],
                              data_inputs[i])
        assert np.array_equal(clone[0].callback_function.data[0][1],
                              expected_outputs[i])

    assert np.array_equal(processed_outputs, data_inputs * 2)
Example #21
0
def test_logger():
    """A pipeline run with a logger-bearing ExecutionContext should write to the log file."""
    file_path = "test.log"

    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        # Fix: renamed the ambiguous name `l` (PEP 8 E741) and actually check
        # that something was logged — previously the content was read and ignored.
        log_content = f.read()
    assert len(log_content) > 0

    # Teardown. Fix: detach the handler from the shared 'test' logger so it
    # does not leak into other tests, then close it and remove the file.
    logger.removeHandler(file_handler)
    file_handler.close()
    os.remove(file_path)
Example #22
0
def main():
    """
    The task is to sleep 0.02 seconds for each data input and then multiply by 2.
    Run it three ways (classical, minibatched, parallel queued) and compare run times.
    """
    sleep_time = 0.02
    preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]

    # Classical pipeline - all at once with one big batch:
    p = Pipeline(preprocessing_and_model_steps)
    time_vanilla_pipeline, output_classical = eval_run_time(p)
    print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.")

    # Classical minibatch pipeline - minibatch size 10:
    p = MiniBatchSequentialPipeline(preprocessing_and_model_steps,
                                    batch_size=10)
    time_minibatch_pipeline, output_minibatch = eval_run_time(p)
    print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.")

    # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that
    # have a max queue size of 10 batches between preprocessing and the model:
    p = SequentialQueuedPipeline(preprocessing_and_model_steps,
                                 n_workers_per_step=16, max_queue_size=10, batch_size=10)
    time_parallel_pipeline, output_parallel = eval_run_time(p)
    print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.")

    # Fix: the failure message previously reported time_vanilla_pipeline even
    # though the comparison is against time_minibatch_pipeline.
    assert time_parallel_pipeline < time_minibatch_pipeline, str((time_parallel_pipeline, time_minibatch_pipeline))
    assert np.array_equal(output_classical, output_minibatch)
    assert np.array_equal(output_classical, output_parallel)
Example #23
0
def test_trainer_train():
    """Trainer should train a pipeline whose first trained split fits the data well."""
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression(),
    ])

    trainer: Trainer = Trainer(
        epochs=10,
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        validation_splitter=ValidationSplitter(test_size=0.20))

    repo_trial: Trial = trainer.train(pipeline=pipeline,
                                      data_inputs=data_inputs,
                                      expected_outputs=expected_outputs)

    trained_pipeline = repo_trial.get_trained_pipeline(split_number=0)

    predicted_outputs = trained_pipeline.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, predicted_outputs)

    assert mse < 1
Example #24
0
def test_queued_pipeline_with_included_incomplete_batch():
    """With keep_incomplete_batch=True and null default values, the trailing
    partial batch is processed as-is."""
    queued = SequentialQueuedPipeline(
        [MultiplyByN(2) for _ in range(4)],
        batch_size=10,
        keep_incomplete_batch=True,
        default_value_data_inputs=AbsentValuesNullObject(),
        default_value_expected_outputs=AbsentValuesNullObject(),
        n_workers_per_step=1,
        max_queue_size=5)

    results = queued.transform(list(range(15)))

    # All 15 items go through, including the incomplete batch of 5.
    assert np.array_equal(results, np.array(list(range(15))) * 2 * 2 * 2 * 2)
Example #25
0
def test_queued_pipeline_with_savers(tmpdir):
    """ParallelQueuedFeatureUnion with use_savers=True should still produce the
    expected parallel outputs."""
    # Given
    union = ParallelQueuedFeatureUnion(
        [(name, MultiplyByN(2)) for name in ('1', '2', '3', '4')],
        n_workers_per_step=1, max_queue_size=10, batch_size=10,
        use_savers=True, cache_folder=tmpdir)

    # When
    results = union.transform(list(range(100)))

    # Then
    assert np.array_equal(results, EXPECTED_OUTPUTS_PARALLEL)
def test_flatten_for_each_should_transform_data_inputs():
    """FlattenForEach should flatten, transform, then restore the input shape."""
    step = FlattenForEach(MultiplyByN(2))
    data_inputs, _ = _create_random_of_shape(DATA_SHAPE)

    results = step.transform(data_inputs)

    assert np.array(results).shape == DATA_SHAPE
    assert np.array_equal(results, data_inputs * 2)
Example #27
0
def test_optional_should_disable_wrapped_step_when_disabled():
    """A disabled Optional step should skip its wrapped step and return the
    nullified value instead."""
    step = Optional(MultiplyByN(2), nullified_return_value=[]).set_hyperparams(
        HyperparameterSamples({'enabled': False}))

    outputs = step.transform(np.array(list(range(10))))

    assert outputs == []
Example #28
0
def test_queued_pipeline_with_included_incomplete_batch_that_raises_an_exception():
    """Keeping the incomplete batch without default values should raise."""
    with pytest.raises(AttributeError):
        queued = SequentialQueuedPipeline(
            [MultiplyByN(2) for _ in range(4)],
            batch_size=10,
            keep_incomplete_batch=True,
            # None default values make the worker raise an exception.
            default_value_data_inputs=None,
            default_value_expected_outputs=None,
            n_workers_per_step=1,
            max_queue_size=5)
        queued.transform(list(range(15)))
Example #29
0
def test_queued_pipeline_with_step_with_threading():
    """use_processes=False should run the queued pipeline with threads and
    still produce the expected outputs."""
    queued = SequentialQueuedPipeline(
        [MultiplyByN(2) for _ in range(4)],
        batch_size=10,
        n_workers_per_step=1,
        max_queue_size=5,
        use_processes=False)

    result = queued.handle_transform(
        DataContainer(data_inputs=list(range(100))), ExecutionContext())

    assert np.array_equal(result.data_inputs, EXPECTED_OUTPUTS)
def main():
    """Round-trip: transform then inverse_transform should recover the inputs."""
    pipeline = Pipeline([MultiplyByN(multiply_by=2)])

    data_inputs = np.array([1, 2])
    generated_outputs = pipeline.transform(data_inputs)
    regenerated_inputs = pipeline.inverse_transform(generated_outputs)

    assert np.array_equal(regenerated_inputs, data_inputs)
    assert np.array_equal(generated_outputs, 2 * data_inputs)