コード例 #1
0
def create_pipeline(tmpdir,
                    pickle_checkpoint_step,
                    tape,
                    hyperparameters=None,
                    different=False,
                    save_pipeline=True):
    """Build a four-step ``ResumablePipeline`` with a checkpoint as second step.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``.
    :param pickle_checkpoint_step: step registered under ``'pickle_checkpoint'``.
    :param tape: object whose ``callback`` attribute is handed to every
        callback step.
    :param hyperparameters: forwarded as ``hyperparams`` to the first step only.
    :param different: when truthy, the first step is a
        ``DifferentCallbackStep`` instead of a ``TransformCallbackStep``.
    :param save_pipeline: accepted but not used by this function.
    :return: the newly built ``ResumablePipeline``.
    """
    # Only the class of step 'a' depends on ``different``; everything else is
    # shared between the two variants.
    first_step_class = DifferentCallbackStep if different else TransformCallbackStep

    steps = [
        ('a', first_step_class(tape.callback, ["1"],
                               hyperparams=hyperparameters)),
        ('pickle_checkpoint', pickle_checkpoint_step),
        ('c', TransformCallbackStep(tape.callback, ["2"])),
        ('d', TransformCallbackStep(tape.callback, ["3"])),
    ]
    return ResumablePipeline(steps=steps, cache_folder=tmpdir)
コード例 #2
0
def test_resumable_pipeline_fit_transform_should_save_all_fitted_pipeline_steps(
        tmpdir: LocalPath):
    """Check which step paths exist on disk after ``fit_transform``.

    After fitting a root pipeline containing a nested pipeline with a
    checkpoint, the root, the nested pipeline, steps 1 and 2 and the
    checkpoint paths must exist, while the path of step 3 (the step placed
    after the checkpoint) must not.
    """
    first_step = (SOME_STEP_1, MultiplyByN(multiply_by=2))
    nested_pipeline = (PIPELINE_2, ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=4)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=6)),
    ]))
    root_pipeline = ResumablePipeline([first_step, nested_pipeline],
                                      cache_folder=tmpdir)
    root_pipeline.name = ROOT

    root_pipeline, outputs = root_pipeline.fit_transform(
        np.array(range(10)), np.array(range(10)))

    paths_expected_missing = [create_some_step3_path(tmpdir)]
    paths_expected_present = [
        create_root_path(tmpdir),
        create_pipeline2_path(tmpdir),
        create_some_step1_path(tmpdir),
        create_some_step2_path(tmpdir),
        create_some_checkpoint_path(tmpdir),
    ]

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
    for present_path in paths_expected_present:
        assert os.path.exists(present_path)
    for missing_path in paths_expected_missing:
        assert not os.path.exists(missing_path)
コード例 #3
0
def test_should_transform_each_steps(test_case: ResumablePipelineTestCase,
                                     tmpdir):
    """Transform with the test case's steps and compare the recorded tape.

    The name tape recorded by ``test_case.tape`` must equal
    ``test_case.expected_tape`` and the transformed data must equal the
    original data inputs.
    """
    resumable = ResumablePipeline(steps=test_case.steps, cache_folder=tmpdir)

    transformed = resumable.transform(test_case.data_inputs)

    recorded_names = test_case.tape.get_name_tape()
    assert recorded_names == test_case.expected_tape
    assert np.array_equal(transformed, test_case.data_inputs)
コード例 #4
0
def test_should_fit_each_steps(test_case: ResumablePipelineTestCase, tmpdir):
    """Fit with the test case's steps and compare the recorded tape.

    ``fit`` must return a ``Pipeline`` instance, and the recorded name tape
    must equal the expected tape without its last entry.
    """
    resumable = ResumablePipeline(steps=test_case.steps, cache_folder=tmpdir)

    fitted = resumable.fit(test_case.data_inputs, test_case.expected_outputs)

    recorded_names = test_case.tape.get_name_tape()
    assert isinstance(fitted, Pipeline)
    # The last expected entry is dropped for the fit-only comparison.
    assert recorded_names == test_case.expected_tape[:-1]
コード例 #5
0
def test_pickle_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    """Fit-transform one pipeline, then transform with a fresh identical one.

    Both pipelines share ``tmpdir`` as cache folder. The second pipeline is
    driven through ``handle_transform`` and its resulting data container must
    hold data inputs ``[4, 8]`` and expected outputs ``[8, 12]``.
    """
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    def build_pipeline():
        """Return a new pipeline: transformer, checkpoint, transformer."""
        return ResumablePipeline([
            ('output_transformer_1', MultiplyBy2OutputTransformer()),
            ('pickle_checkpoint', DefaultCheckpoint()),
            ('output_transformer_2', MultiplyBy2OutputTransformer()),
        ], cache_folder=tmpdir)

    # First run, on a throwaway pipeline instance.
    build_pipeline().fit_transform(
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs)

    # Second run, on a brand new instance using the same cache folder.
    transformer = build_pipeline()
    data_container = DataContainer(current_ids=[0, 1],
                                   data_inputs=initial_data_inputs,
                                   expected_outputs=initial_expected_outputs)
    context = ExecutionContext.create_from_root(
        transformer, ExecutionMode.TRANSFORM, tmpdir)
    actual_data_container = transformer.handle_transform(data_container,
                                                         context)

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
コード例 #6
0
def create_pipeline_output_transformer(tmpdir):
    """Return a ``ResumablePipeline`` with a checkpoint between two transformers.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``.
    :return: pipeline made of ``MultiplyBy2OutputTransformer``,
        ``DefaultCheckpoint`` and a second ``MultiplyBy2OutputTransformer``.
    """
    steps = [
        ('output_transformer_1', MultiplyBy2OutputTransformer()),
        ('joblib_checkpoint', DefaultCheckpoint()),
        ('output_transformer_2', MultiplyBy2OutputTransformer()),
    ]
    return ResumablePipeline(steps, cache_folder=tmpdir)
コード例 #7
0
ファイル: test_checkpoints.py プロジェクト: ipmeme/Neuraxle
def create_checkpoint_test_case(tmpdir):
    """Build a ``CheckpointTest`` around a step / checkpoint / step pipeline.

    Four ``TapeCallbackFunction`` objects are created (transform and fit
    tapes for each of the two callback steps) and returned together with the
    pipeline so the caller can inspect them.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``.
    :return: ``CheckpointTest(tape_transform_1, tape_fit_1, tape_transform_2,
        tape_fit_2, pipeline)``.
    """
    # Created in the same order as they are later passed to CheckpointTest.
    tape_transform_1, tape_fit_1, tape_transform_2, tape_fit_2 = (
        TapeCallbackFunction() for _ in range(4))

    steps = [
        ('step1', FitTransformCallbackStep(tape_transform_1, tape_fit_1)),
        ('checkpoint', DefaultCheckpoint()),
        ('step2', FitTransformCallbackStep(tape_transform_2, tape_fit_2)),
    ]
    pipeline = ResumablePipeline(steps, cache_folder=tmpdir)

    return CheckpointTest(tape_transform_1, tape_fit_1, tape_transform_2,
                          tape_fit_2, pipeline)
コード例 #8
0
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """Run the same AutoML search on a classic and on a resumable pipeline.

    For each pipeline the elapsed time, the predictions of the best model,
    the mean squared error against the expected outputs and the pipeline's
    hyperparameters are printed.

    :param tmpdir: base folder; ``classic`` and ``resumable`` sub-folders of
        it are used as cache folders.
    :param sleep_time: value given to every ``Sleep`` step.
    :param n_iter: number of AutoML trials (``n_trials``).
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparams_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_predict(candidate, cache_folder):
        """Fit an AutoML loop around ``candidate`` and predict with its best model."""
        auto_ml = AutoML(
            candidate,
            refit_trial=True,
            n_trials=n_iter,
            cache_folder_when_no_handle=cache_folder,
            validation_splitter=ValidationSplitter(0.2),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            scoring_callback=ScoringCallback(
                mean_squared_error, higher_score_is_better=False),
            callbacks=[
                MetricCallback('mse',
                               metric_function=mean_squared_error,
                               higher_score_is_better=False)
            ],
        )
        fitted_auto_ml = auto_ml.fit(data_inputs, expected_outputs)
        return fitted_auto_ml.get_best_model().predict(data_inputs)

    def report(elapsed_seconds, outputs, candidate):
        """Print the run summary and check that the score is a float."""
        actual_score = mean_squared_error(expected_outputs, outputs)
        print('{0} seconds'.format(elapsed_seconds))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))

        assert isinstance(actual_score, float)

    # --- Classic pipeline -------------------------------------------------
    print('Classic Pipeline:')
    classic_folder = os.path.join(str(tmpdir), 'classic')

    classic_pipeline = Pipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=classic_folder,
    ).set_hyperparams_space(hyperparams_space)

    started_at = time.time()
    classic_outputs = search_and_predict(classic_pipeline, classic_folder)
    finished_at = time.time()

    report(finished_at - started_at, classic_outputs, classic_pipeline)

    # --- Resumable pipeline -----------------------------------------------
    print('Resumable Pipeline:')
    resumable_folder = os.path.join(str(tmpdir), 'resumable')

    resumable_pipeline = ResumablePipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint1', ExpandDim(DefaultCheckpoint())),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint2', ExpandDim(DefaultCheckpoint())),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=resumable_folder,
    ).set_hyperparams_space(hyperparams_space)

    started_at = time.time()
    resumable_outputs = search_and_predict(resumable_pipeline,
                                           resumable_folder)
    finished_at = time.time()
    # Cache flushing happens after the end timestamp, so it is not timed.
    resumable_pipeline.flush_all_cache()

    report(finished_at - started_at, resumable_outputs, resumable_pipeline)
コード例 #9
0
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """Run the same random search on a classic and on a resumable pipeline.

    For each pipeline the elapsed time, the transformed outputs of the fitted
    search, the mean squared error against the expected outputs and the
    pipeline's hyperparameters are printed.

    :param tmpdir: cache folder of the resumable pipeline.
    :param sleep_time: value given to every ``Sleep`` step.
    :param n_iter: number of random search iterations.
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparams_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_transform(candidate):
        """Fit a ``RandomSearch`` around ``candidate`` and transform the inputs."""
        # NOTE(review): higher_score_is_better=True while the report below is
        # labelled "smallest mse" - confirm which scoring function is used.
        search = RandomSearch(candidate,
                              n_iter=n_iter,
                              higher_score_is_better=True)
        best_model = search.fit(data_inputs, expected_outputs)
        return best_model.transform(data_inputs)

    def report(elapsed_seconds, outputs, candidate):
        """Print the run summary and check that the score is a float."""
        actual_score = mean_squared_error(expected_outputs, outputs)
        print('{0} seconds'.format(elapsed_seconds))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))

        assert isinstance(actual_score, float)

    # --- Classic pipeline -------------------------------------------------
    print('Classic Pipeline:')

    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(hyperparams_space)

    started_at = time.time()
    classic_outputs = search_and_transform(classic_pipeline)
    finished_at = time.time()

    report(finished_at - started_at, classic_outputs, classic_pipeline)

    # --- Resumable pipeline -----------------------------------------------
    print('Resumable Pipeline:')

    resumable_pipeline = ResumablePipeline(
        [
            ('multiplication_1', MultiplyByN()),
            ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint1', ExpandDim(DefaultCheckpoint())),
            ('multiplication_2', MultiplyByN()),
            ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
            ('checkpoint2', ExpandDim(DefaultCheckpoint())),
            ('multiplication_3', MultiplyByN()),
        ],
        cache_folder=tmpdir,
    ).set_hyperparams_space(hyperparams_space)

    started_at = time.time()
    resumable_outputs = search_and_transform(resumable_pipeline)
    finished_at = time.time()
    # Cache flushing happens after the end timestamp, so it is not timed.
    resumable_pipeline.flush_all_cache()

    report(finished_at - started_at, resumable_outputs, resumable_pipeline)
コード例 #10
0
def create_test_cases():
    """Build the list of ``ResumablePipelineTestCase`` scenarios.

    Every scenario gets its own pair of ``TapeCallbackFunction`` objects
    (transform tape and fit tape), a list of steps mixing
    ``FitTransformCallbackStep`` and ``SomeCheckpointStep`` (optionally nested
    in ``ResumablePipeline`` sub-pipelines) and the tape expected for it.
    All scenarios share the same 1x1 data inputs / expected outputs and the
    same ``DataContainer`` given to the checkpoint steps.

    :return: list of the eleven test cases.
    """
    data_inputs = np.ones((1, 1))
    expected_outputs = np.ones((1, 1))
    dc = DataContainer(data_inputs=data_inputs,
                       current_ids=range(len(data_inputs)),
                       expected_outputs=expected_outputs)

    def new_tape():
        """Return a fresh transform tape and a step factory bound to it."""
        tape = TapeCallbackFunction()
        tape_fit = TapeCallbackFunction()

        def step(label):
            return FitTransformCallbackStep(tape.callback, tape_fit.callback,
                                            [label])

        return tape, step

    def checkpoint():
        """Return a checkpoint step holding the shared data container."""
        return SomeCheckpointStep(data_container=dc)

    def case(tape, steps, expected_tape):
        """Wrap steps and expected tape with the shared inputs / outputs."""
        return ResumablePipelineTestCase(tape, data_inputs, expected_outputs,
                                         steps, expected_tape)

    # 1. No checkpoint at all.
    tape, step = new_tape()
    without_checkpoint = case(tape, [
        ("a", step("1")),
        ("b", step("2")),
        ("c", step("3")),
    ], ["1", "2", "3"])

    # 2. Checkpoint step created without any data container.
    tape, step = new_tape()
    checkpoint_not_saved = case(tape, [
        ("a", step("1")),
        ("b", SomeCheckpointStep(data_container=None)),
        ("c", step("2")),
        ("d", step("3")),
    ], ["1", "2", "3"])

    # 3. Checkpoint after the first step.
    tape, step = new_tape()
    saved_after_first_step = case(tape, [
        ("a", step("1")),
        ("b", checkpoint()),
        ("c", step("2")),
        ("d", step("3")),
    ], ["2", "3"])

    # 4. Checkpoint after the second step.
    tape, step = new_tape()
    saved_after_second_step = case(tape, [
        ("a", step("1")),
        ("b", step("2")),
        ("c", checkpoint()),
        ("d", step("3")),
    ], ["3"])

    # 5. Checkpoint after the last step.
    tape, step = new_tape()
    saved_after_last_step = case(tape, [
        ("a", step("1")),
        ("b", step("2")),
        ("c", step("3")),
        ("d", checkpoint()),
    ], [])

    # 6. Checkpoint as last step of a sub-pipeline.
    tape, step = new_tape()
    saved_inside_subpipeline_last_step = case(tape, [
        ("a", step("1")),
        ResumablePipeline([
            ("b", step("2")),
            ("d", checkpoint()),
        ]),
        ("e", step("3")),
        ("f", step("4")),
    ], ["3", "4"])

    # 7. Checkpoint as first step of a sub-pipeline.
    tape, step = new_tape()
    saved_inside_subpipeline_first_step = case(tape, [
        ("a", step("1")),
        ResumablePipeline([
            ("d", checkpoint()),
            ("b", step("2")),
        ]),
        ("e", step("3")),
        ("f", step("4")),
    ], ["2", "3", "4"])

    # 8. Checkpoint in the middle of a sub-pipeline.
    tape, step = new_tape()
    saved_inside_subpipeline_step_in_the_middle = case(tape, [
        ("a", step("1")),
        ResumablePipeline([
            ("b", step("2")),
            ("d", checkpoint()),
            ("e", step("3")),
        ]),
        ("f", step("4")),
    ], ["3", "4"])

    # 9. Checkpoint inside a sub-pipeline of a sub-pipeline.
    tape, step = new_tape()
    saved_inside_subpipeline_of_subpipeline = case(tape, [
        ("a", step("1")),
        ResumablePipeline([
            ("b", step("2")),
            ResumablePipeline([
                ("e", step("3")),
                ("d", checkpoint()),
                ("f", step("4")),
            ]),
            ("g", step("5")),
        ]),
        ("h", step("6")),
    ], ["4", "5", "6"])

    # 10. Two checkpoints separated by a step.
    # NOTE(review): the step name "b" is used twice in this case - confirm
    # that the duplicate name is intentional.
    tape, step = new_tape()
    saved_checkpoint_after_another_saved_checkpoint = case(tape, [
        ("a", step("1")),
        ("b", checkpoint()),
        ("c", step("2")),
        ("b", checkpoint()),
        ("d", step("3")),
    ], ["3"])

    # 11. Two checkpoints in a row.
    tape, step = new_tape()
    multiple_checkpoint_in_a_row = case(tape, [
        ("a", step("1")),
        ("joblib_1", checkpoint()),
        ("joblib_2", checkpoint()),
        ("c", step("2")),
        ("d", step("3")),
    ], ["2", "3"])

    # The "first step" sub-pipeline case is returned before the "last step"
    # one; this ordering is part of the returned value and is kept as is.
    return [
        without_checkpoint,
        checkpoint_not_saved,
        saved_after_first_step,
        saved_after_second_step,
        saved_after_last_step,
        saved_inside_subpipeline_first_step,
        saved_inside_subpipeline_last_step,
        saved_inside_subpipeline_step_in_the_middle,
        saved_inside_subpipeline_of_subpipeline,
        saved_checkpoint_after_another_saved_checkpoint,
        multiple_checkpoint_in_a_row,
    ]
コード例 #11
0
def given_saved_pipeline(tmpdir: LocalPath):
    """Dump a root pipeline, a nested pipeline, three steps and a checkpoint.

    Empty ``ResumablePipeline`` shells carrying only a name and their
    ``sub_steps_savers`` are dumped for the root and the nested pipeline,
    three steps are saved with ``multiply_by`` 2, 4 and 6, and a named
    ``DefaultCheckpoint`` is dumped. A new in-memory pipeline with the same
    layout, but ``multiply_by=1`` on every step, is then returned.

    :param tmpdir: folder used as cache folder and as base for all paths.
    :return: the new, not yet loaded, root ``ResumablePipeline``.
    """
    # Root pipeline shell.
    root_savers = [(SOME_STEP_1, []),
                   (PIPELINE_2, [TruncableJoblibStepSaver()])]
    root_path = create_root_path(tmpdir, True)
    saved_root = ResumablePipeline([], cache_folder=tmpdir)
    saved_root.sub_steps_savers = root_savers
    saved_root.name = ROOT
    dump(saved_root, root_path)

    # Nested pipeline shell.
    saved_pipeline_2 = ResumablePipeline([], cache_folder=tmpdir)
    # NOTE(review): a literal is used here while PIPELINE_2 is used
    # everywhere else - confirm both hold the same value.
    saved_pipeline_2.name = 'pipeline2'
    saved_pipeline_2.sub_steps_savers = [
        (SOME_STEP_2, []),
        (CHECKPOINT, []),
        (SOME_STEP_3, []),
    ]
    dump(saved_pipeline_2, create_pipeline2_path(tmpdir, True))

    # Individual steps, each saved with a distinct multiplier.
    saved_steps = (
        (2, SOME_STEP_1, create_some_step1_path),
        (4, SOME_STEP_2, create_some_step2_path),
        (6, SOME_STEP_3, create_some_step3_path),
    )
    for factor, step_name, make_path in saved_steps:
        given_saved_some_step(multiply_by=factor,
                              name=step_name,
                              path=make_path(tmpdir, True))

    # Checkpoint step.
    saved_checkpoint = DefaultCheckpoint()
    saved_checkpoint.name = CHECKPOINT
    dump(saved_checkpoint, create_some_checkpoint_path(tmpdir, True))

    # Fresh pipeline with the same structure but neutral multipliers.
    nested = ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=1)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=1)),
    ])
    fresh_root = ResumablePipeline(
        [(SOME_STEP_1, MultiplyByN(multiply_by=1)), (PIPELINE_2, nested)],
        cache_folder=tmpdir)
    fresh_root.name = ROOT

    return fresh_root