def test_when_hyperparams_and_saved_no_pipeline_should_not_load_checkpoint_pickle(
        tmpdir: LocalPath):
    """Fit a pipeline after a previous run that was built with ``save_pipeline=False``.

    A first pipeline is created with ``different=True`` and
    ``save_pipeline=False`` and fitted on the shared fixtures. A second
    pipeline is then created in the same ``tmpdir`` with the default flags.
    The second run must return the data inputs unchanged and its tape must
    contain the names ``"1"``, ``"2"`` and ``"3"``.
    """
    # Given
    recorded_tape = TapeCallbackFunction()
    loading_checkpoint = DefaultCheckpoint()

    # When
    saving_pipeline = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=TapeCallbackFunction(),
        hyperparameters=HyperparameterSamples({"a__learning_rate": 1}),
        different=True,
        save_pipeline=False,
    )
    saving_pipeline.fit_transform(data_inputs, expected_outputs)

    loading_pipeline = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=loading_checkpoint,
        tape=recorded_tape,
        hyperparameters=HyperparameterSamples({"a__learning_rate": 1}),
    )
    # fit_transform yields a (fitted pipeline, outputs) pair.
    fit_result = loading_pipeline.fit_transform(data_inputs, expected_outputs)
    loading_pipeline, transformed_inputs = fit_result

    # Then
    tape_names = recorded_tape.get_name_tape()
    assert np.array_equal(transformed_inputs, data_inputs)
    assert tape_names == ["1", "2", "3"]
Exemple #2
0
def test_when_hyperparams_and_saved_same_pipeline_should_load_checkpoint_pickle(tmpdir: LocalPath):
    """Fit two identically configured pipelines in the same ``tmpdir``.

    The first pipeline is fitted on the shared fixtures. The second one, built
    with the same hyperparameters and its own fresh checkpoint, must return
    the data inputs unchanged and record ``EXPECTED_TAPE_AFTER_CHECKPOINT``
    on its tape.
    """
    # Given
    recorded_tape = TapeCallbackFunction()

    def fresh_hyperparams():
        # Each pipeline receives its own samples object.
        return HyperparameterSamples({"a__learning_rate": 1})

    # When
    saving_pipeline = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=TapeCallbackFunction(),
        hyperparameters=fresh_hyperparams(),
    )
    saving_pipeline.fit_transform(data_inputs, expected_outputs)

    loading_pipeline = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=recorded_tape,
        hyperparameters=fresh_hyperparams(),
    )
    fit_result = loading_pipeline.fit_transform(data_inputs, expected_outputs)
    loading_pipeline, transformed_inputs = fit_result

    # Then
    tape_names = recorded_tape.get_name_tape()
    assert np.array_equal(transformed_inputs, data_inputs)
    assert tape_names == EXPECTED_TAPE_AFTER_CHECKPOINT
def test_pickle_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    """Transform through a checkpointed pipeline after a prior ``fit_transform``.

    A first pipeline instance is fitted on ``[1, 2]`` / ``[2, 3]``. A second,
    freshly built instance sharing the same cache folder then handles a
    transform of the same data. The resulting container must hold
    ``[4, 8]`` as data inputs and ``[8, 12]`` as expected outputs.
    """
    raw_inputs = [1, 2]
    raw_expected = [2, 3]

    def build_pipeline():
        # A new pipeline (with new step instances) is built on every call;
        # only the cache folder is shared between instances.
        steps = [
            ('output_transformer_1', MultiplyBy2OutputTransformer()),
            ('pickle_checkpoint', DefaultCheckpoint()),
            ('output_transformer_2', MultiplyBy2OutputTransformer()),
        ]
        return ResumablePipeline(steps, cache_folder=tmpdir)

    build_pipeline().fit_transform(
        data_inputs=raw_inputs,
        expected_outputs=raw_expected,
    )

    transformer = build_pipeline()
    container = DataContainer(
        current_ids=[0, 1],
        data_inputs=raw_inputs,
        expected_outputs=raw_expected,
    )
    context = ExecutionContext.create_from_root(
        transformer, ExecutionMode.TRANSFORM, tmpdir)
    result_container = transformer.handle_transform(container, context)

    assert np.array_equal(result_container.data_inputs, [4, 8])
    assert np.array_equal(result_container.expected_outputs, [8, 12])
def test_when_hyperparams_should_save_checkpoint_pickle(tmpdir: LocalPath):
    """Fit a pipeline with hyperparameters and check the checkpoint files exist.

    After ``fit_transform`` the outputs must equal the data inputs, the tape
    must contain ``"1"``, ``"2"`` and ``"3"``, and two pickle files must exist
    in each of the ``di`` and ``eo`` folders under
    ``<tmpdir>/ResumablePipeline/pickle_checkpoint``.
    """
    recorded_tape = TapeCallbackFunction()
    checkpoint = DefaultCheckpoint()
    pipeline = create_pipeline(
        tmpdir,
        checkpoint,
        recorded_tape,
        HyperparameterSamples({"a__learning_rate": 1}),
    )

    fit_result = pipeline.fit_transform(data_inputs, expected_outputs)
    pipeline, transformed_inputs = fit_result

    tape_names = recorded_tape.get_name_tape()
    assert np.array_equal(transformed_inputs, data_inputs)
    assert tape_names == ["1", "2", "3"]

    # Same file names are expected in both sub-folders; checked in the
    # order di/first, di/second, eo/first, eo/second.
    checkpoint_dir = os.path.join(tmpdir, 'ResumablePipeline', 'pickle_checkpoint')
    expected_files = (
        '44f9d6dd8b6ccae571ca04525c3eaffa.pickle',
        '898a67b2f5eeae6393ca4b3162ba8e3d.pickle',
    )
    for sub_folder in ('di', 'eo'):
        for file_name in expected_files:
            assert os.path.exists(
                os.path.join(checkpoint_dir, sub_folder, file_name))
Exemple #5
0
def create_pipeline_output_transformer(tmpdir):
    """Build a three-step ``ResumablePipeline`` cached in ``tmpdir``.

    The steps are, in order: a ``MultiplyBy2OutputTransformer``, a
    ``DefaultCheckpoint`` named ``'joblib_checkpoint'``, and a second
    ``MultiplyBy2OutputTransformer``.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``
    :return: the newly constructed pipeline
    """
    steps = [
        ('output_transformer_1', MultiplyBy2OutputTransformer()),
        ('joblib_checkpoint', DefaultCheckpoint()),
        ('output_transformer_2', MultiplyBy2OutputTransformer()),
    ]
    return ResumablePipeline(steps, cache_folder=tmpdir)
Exemple #6
0
def test_resumable_pipeline_fit_transform_should_save_all_fitted_pipeline_steps(
        tmpdir: LocalPath):
    """Fit a nested resumable pipeline and check which step paths exist.

    The root pipeline holds one multiplication step and a nested pipeline
    made of a multiplication, a checkpoint and another multiplication.
    After ``fit_transform`` the outputs must equal ``EXPECTED_OUTPUTS``; the
    paths of the root, the nested pipeline, the first two steps and the
    checkpoint must exist, while the path of the third step must not.
    """
    nested_pipeline = ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=4)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=6)),
    ])
    root_pipeline = ResumablePipeline(
        [
            (SOME_STEP_1, MultiplyByN(multiply_by=2)),
            (PIPELINE_2, nested_pipeline),
        ],
        cache_folder=tmpdir,
    )
    root_pipeline.name = ROOT

    fit_result = root_pipeline.fit_transform(
        np.array(range(10)), np.array(range(10)))
    root_pipeline, outputs = fit_result

    paths_expected_missing = [create_some_step3_path(tmpdir)]
    paths_expected_present = [
        create_root_path(tmpdir),
        create_pipeline2_path(tmpdir),
        create_some_step1_path(tmpdir),
        create_some_step2_path(tmpdir),
        create_some_checkpoint_path(tmpdir),
    ]

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
    for present_path in paths_expected_present:
        assert os.path.exists(present_path)
    for missing_path in paths_expected_missing:
        assert not os.path.exists(missing_path)
Exemple #7
0
def given_saved_pipeline(tmpdir: LocalPath):
    """Dump a nested pipeline to ``tmpdir`` and return a fresh, unfitted copy.

    Written with ``dump``, in order: an empty root pipeline carrying the
    sub-step savers, an empty nested pipeline, the three multiplication
    steps (via ``given_saved_some_step`` with factors 2, 4 and 6) and a
    checkpoint. The returned pipeline has the same structure and names but
    every multiplication step is created with ``multiply_by=1``.

    :param tmpdir: folder used both for the dumps and as ``cache_folder``
    :return: the new ``ResumablePipeline`` named ``ROOT``
    """
    # Root pipeline. The second argument of the path helpers is passed as
    # True for every dump below - presumably "create the directory"; confirm
    # against the helper definitions.
    root_path = create_root_path(tmpdir, True)
    saved_root = ResumablePipeline([], cache_folder=tmpdir)
    saved_root.sub_steps_savers = [
        (SOME_STEP_1, []),
        (PIPELINE_2, [TruncableJoblibStepSaver()]),
    ]
    saved_root.name = ROOT
    dump(saved_root, root_path)

    # Nested pipeline.
    # NOTE(review): the name is the literal 'pipeline2' rather than the
    # PIPELINE_2 constant - confirm both hold the same value.
    saved_nested = ResumablePipeline([], cache_folder=tmpdir)
    saved_nested.name = 'pipeline2'
    saved_nested.sub_steps_savers = [
        (SOME_STEP_2, []),
        (CHECKPOINT, []),
        (SOME_STEP_3, []),
    ]
    dump(saved_nested, create_pipeline2_path(tmpdir, True))

    # Individual multiplication steps, saved in pipeline order.
    for factor, step_name, make_path in (
            (2, SOME_STEP_1, create_some_step1_path),
            (4, SOME_STEP_2, create_some_step2_path),
            (6, SOME_STEP_3, create_some_step3_path),
    ):
        given_saved_some_step(multiply_by=factor,
                              name=step_name,
                              path=make_path(tmpdir, True))

    # Checkpoint step.
    saved_checkpoint = DefaultCheckpoint()
    saved_checkpoint.name = CHECKPOINT
    dump(saved_checkpoint, create_some_checkpoint_path(tmpdir, True))

    # Fresh pipeline with the same layout but neutral multipliers.
    fresh_nested = ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=1)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=1)),
    ])
    fresh_root = ResumablePipeline(
        [
            (SOME_STEP_1, MultiplyByN(multiply_by=1)),
            (PIPELINE_2, fresh_nested),
        ],
        cache_folder=tmpdir,
    )
    fresh_root.name = ROOT

    return fresh_root
Exemple #8
0
def create_checkpoint_test_case(tmpdir):
    """Build a ``CheckpointTest`` around a step / checkpoint / step pipeline.

    Four tapes are created (transform and fit tapes for each of the two
    callback steps) and wired into a ``ResumablePipeline`` whose middle step
    is a ``DefaultCheckpoint``.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``
    :return: ``CheckpointTest`` holding the four tapes followed by the pipeline
    """
    # Tapes are created in the order: transform 1, fit 1, transform 2, fit 2.
    first_transform_tape, first_fit_tape, second_transform_tape, second_fit_tape = (
        TapeCallbackFunction() for _ in range(4)
    )

    steps = [
        ('step1', FitTransformCallbackStep(first_transform_tape, first_fit_tape)),
        ('checkpoint', DefaultCheckpoint()),
        ('step2', FitTransformCallbackStep(second_transform_tape, second_fit_tape)),
    ]
    pipeline = ResumablePipeline(steps, cache_folder=tmpdir)

    return CheckpointTest(
        first_transform_tape,
        first_fit_tape,
        second_transform_tape,
        second_fit_tape,
        pipeline,
    )
Exemple #9
0
def test_when_no_hyperparams_should_save_checkpoint_pickle(tmpdir: LocalPath):
    """Fit a pipeline without hyperparameters and check the checkpoint files.

    After ``fit_transform`` the outputs must equal the data inputs, the tape
    must contain ``"1"``, ``"2"`` and ``"3"``, and ``0.pickle`` / ``1.pickle``
    must exist in both the ``di`` and ``eo`` folders under
    ``<tmpdir>/ResumablePipeline/pickle_checkpoint``.
    """
    recorded_tape = TapeCallbackFunction()
    checkpoint = DefaultCheckpoint()
    pipeline = create_pipeline(tmpdir, checkpoint, recorded_tape)

    fit_result = pipeline.fit_transform(data_inputs, expected_outputs)
    pipeline, transformed_inputs = fit_result

    tape_names = recorded_tape.get_name_tape()
    assert np.array_equal(transformed_inputs, data_inputs)
    assert tape_names == ["1", "2", "3"]

    # Checked in the order di/0, di/1, eo/0, eo/1.
    checkpoint_dir = os.path.join(tmpdir, 'ResumablePipeline', 'pickle_checkpoint')
    for sub_folder in ('di', 'eo'):
        for file_name in ('0.pickle', '1.pickle'):
            assert os.path.exists(
                os.path.join(checkpoint_dir, sub_folder, file_name))
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """Run the same AutoML search on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-item
    ``Sleep`` steps; the resumable one additionally has two checkpoints.
    For each pipeline the elapsed time, the predictions, the mean squared
    error and the hyperparameters of the pipeline object are printed.

    :param tmpdir: base folder; ``classic`` and ``resumable`` sub-folders
        are used as cache folders
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: number of trials passed to ``AutoML`` as ``n_trials``
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))

    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def run_search(candidate, cache_folder):
        # Fit AutoML on the candidate pipeline and predict with the best
        # model; returns the predictions and the wall-clock duration.
        started_at = time.time()
        auto_ml = AutoML(
            candidate,
            refit_trial=True,
            n_trials=n_iter,
            cache_folder_when_no_handle=cache_folder,
            validation_splitter=ValidationSplitter(0.2),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
            callbacks=[
                MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
            ],
        )
        auto_ml = auto_ml.fit(inputs, targets)
        predictions = auto_ml.get_best_model().predict(inputs)
        return predictions, time.time() - started_at

    def report(candidate, predictions, elapsed):
        # Print the run summary and check the score type.
        score = mean_squared_error(targets, predictions)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(predictions))
        print('smallest mse: {0}'.format(score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(score, float)

    print('Classic Pipeline:')
    classic_folder = os.path.join(str(tmpdir), 'classic')
    classic_pipeline = Pipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=classic_folder,
    ).set_hyperparams_space(search_space)

    classic_outputs, classic_elapsed = run_search(classic_pipeline, classic_folder)
    report(classic_pipeline, classic_outputs, classic_elapsed)

    print('Resumable Pipeline:')
    resumable_folder = os.path.join(str(tmpdir), 'resumable')
    resumable_pipeline = ResumablePipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint1', ExpandDim(DefaultCheckpoint())),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint2', ExpandDim(DefaultCheckpoint())),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=resumable_folder,
    ).set_hyperparams_space(search_space)

    resumable_outputs, resumable_elapsed = run_search(resumable_pipeline, resumable_folder)
    # The cache is flushed after timing stops and before the summary is printed.
    resumable_pipeline.flush_all_cache()
    report(resumable_pipeline, resumable_outputs, resumable_elapsed)
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """Run the same ``RandomSearch`` on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-item
    ``Sleep`` steps; the resumable one additionally has two checkpoints and
    uses ``tmpdir`` as its cache folder. For each pipeline the elapsed time,
    the transformed outputs, the mean squared error and the hyperparameters
    of the pipeline object are printed.

    :param tmpdir: cache folder of the resumable pipeline
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: number of iterations passed to ``RandomSearch``
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))

    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def run_search(candidate):
        # Fit the random search on the candidate pipeline and transform the
        # inputs with the result; returns outputs and wall-clock duration.
        # NOTE(review): the search uses higher_score_is_better=True while the
        # summary labels the score 'smallest mse' - confirm this is intended.
        started_at = time.time()
        searcher = RandomSearch(candidate,
                                n_iter=n_iter,
                                higher_score_is_better=True)
        best_model = searcher.fit(inputs, targets)
        transformed = best_model.transform(inputs)
        return transformed, time.time() - started_at

    def report(candidate, transformed, elapsed):
        # Print the run summary and check the score type.
        score = mean_squared_error(targets, transformed)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(transformed))
        print('smallest mse: {0}'.format(score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(score, float)

    print('Classic Pipeline:')
    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(search_space)

    classic_outputs, classic_elapsed = run_search(classic_pipeline)
    report(classic_pipeline, classic_outputs, classic_elapsed)

    print('Resumable Pipeline:')
    resumable_pipeline = ResumablePipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint1', ExpandDim(DefaultCheckpoint())),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint2', ExpandDim(DefaultCheckpoint())),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=tmpdir,
    ).set_hyperparams_space(search_space)

    resumable_outputs, resumable_elapsed = run_search(resumable_pipeline)
    # The cache is flushed after timing stops and before the summary is printed.
    resumable_pipeline.flush_all_cache()
    report(resumable_pipeline, resumable_outputs, resumable_elapsed)