def test_when_hyperparams_and_saved_no_pipeline_should_not_load_checkpoint_pickle(
        tmpdir: LocalPath):
    """
    Fit a first pipeline built with ``different=True`` and ``save_pipeline=False``,
    then fit a second pipeline in the same ``tmpdir`` with the same hyperparameters.

    The second run is expected to record the full tape ``["1", "2", "3"]`` and to
    return the data inputs unchanged.
    """
    # Given
    tape = TapeCallbackFunction()
    pickle_checkpoint_step = DefaultCheckpoint()

    # When
    first_run_hyperparams = HyperparameterSamples({"a__learning_rate": 1})
    pipeline_save = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=TapeCallbackFunction(),
        hyperparameters=first_run_hyperparams,
        different=True,
        save_pipeline=False,
    )
    pipeline_save.fit_transform(data_inputs, expected_outputs)

    second_run_hyperparams = HyperparameterSamples({"a__learning_rate": 1})
    pipeline_load = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=pickle_checkpoint_step,
        tape=tape,
        hyperparameters=second_run_hyperparams,
    )
    _, actual_data_inputs = pipeline_load.fit_transform(data_inputs, expected_outputs)

    # Then
    recorded_names = tape.get_name_tape()
    assert np.array_equal(actual_data_inputs, data_inputs)
    assert recorded_names == ["1", "2", "3"]
def test_when_hyperparams_and_saved_same_pipeline_should_load_checkpoint_pickle(
        tmpdir: LocalPath):
    """
    Fit one pipeline, then fit an identically-configured pipeline in the same
    ``tmpdir`` with the same hyperparameters.

    The tape recorded by the second run is expected to equal
    ``EXPECTED_TAPE_AFTER_CHECKPOINT`` and the data inputs come back unchanged.
    """
    # Given
    tape = TapeCallbackFunction()

    # When
    first_run_hyperparams = HyperparameterSamples({"a__learning_rate": 1})
    pipeline_save = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=TapeCallbackFunction(),
        hyperparameters=first_run_hyperparams,
    )
    pipeline_save.fit_transform(data_inputs, expected_outputs)

    second_run_hyperparams = HyperparameterSamples({"a__learning_rate": 1})
    pipeline_load = create_pipeline(
        tmpdir=tmpdir,
        pickle_checkpoint_step=DefaultCheckpoint(),
        tape=tape,
        hyperparameters=second_run_hyperparams,
    )
    _, actual_data_inputs = pipeline_load.fit_transform(data_inputs, expected_outputs)

    # Then
    recorded_names = tape.get_name_tape()
    assert np.array_equal(actual_data_inputs, data_inputs)
    assert recorded_names == EXPECTED_TAPE_AFTER_CHECKPOINT
def test_pickle_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    """
    Fit-transform one pipeline, then call ``handle_transform`` on a freshly built
    pipeline that shares the same cache folder, and check the data inputs and
    expected outputs of the resulting data container.
    """
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    def build_pipeline():
        # A new pipeline instance each call; all instances share ``tmpdir`` as cache folder.
        steps = [
            ('output_transformer_1', MultiplyBy2OutputTransformer()),
            ('pickle_checkpoint', DefaultCheckpoint()),
            ('output_transformer_2', MultiplyBy2OutputTransformer()),
        ]
        return ResumablePipeline(steps, cache_folder=tmpdir)

    build_pipeline().fit_transform(
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs,
    )

    transformer = build_pipeline()
    input_container = DataContainer(
        current_ids=[0, 1],
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs,
    )
    context = ExecutionContext.create_from_root(transformer, ExecutionMode.TRANSFORM, tmpdir)
    actual_data_container = transformer.handle_transform(input_container, context)

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
def test_when_hyperparams_should_save_checkpoint_pickle(tmpdir: LocalPath):
    """
    Fit-transform a pipeline created with hyperparameters and check that the
    expected pickle files exist under the checkpoint's ``di`` and ``eo`` folders.
    """
    tape = TapeCallbackFunction()
    pickle_checkpoint_step = DefaultCheckpoint()
    hyperparams = HyperparameterSamples({"a__learning_rate": 1})
    pipeline = create_pipeline(tmpdir, pickle_checkpoint_step, tape, hyperparams)

    pipeline, actual_data_inputs = pipeline.fit_transform(data_inputs, expected_outputs)

    recorded_names = tape.get_name_tape()
    assert np.array_equal(actual_data_inputs, data_inputs)
    assert recorded_names == ["1", "2", "3"]

    checkpoint_folder = os.path.join(tmpdir, 'ResumablePipeline', 'pickle_checkpoint')
    expected_file_names = (
        '44f9d6dd8b6ccae571ca04525c3eaffa.pickle',
        '898a67b2f5eeae6393ca4b3162ba8e3d.pickle',
    )
    # Same four files, checked in the same order: both 'di' files, then both 'eo' files.
    for sub_folder in ('di', 'eo'):
        for file_name in expected_file_names:
            assert os.path.exists(os.path.join(checkpoint_folder, sub_folder, file_name))
def create_pipeline_output_transformer(tmpdir):
    """
    Build a ``ResumablePipeline`` made of two ``MultiplyBy2OutputTransformer``
    steps with a ``DefaultCheckpoint`` between them.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``
    :return: the newly constructed pipeline
    """
    steps = [
        ('output_transformer_1', MultiplyBy2OutputTransformer()),
        ('joblib_checkpoint', DefaultCheckpoint()),
        ('output_transformer_2', MultiplyBy2OutputTransformer()),
    ]
    return ResumablePipeline(steps, cache_folder=tmpdir)
def test_resumable_pipeline_fit_transform_should_save_all_fitted_pipeline_steps(
        tmpdir: LocalPath):
    """
    Fit-transform a nested resumable pipeline and check which step paths exist.

    The paths for the root, pipeline 2, steps 1 and 2 and the checkpoint must
    exist; the path for step 3 must not.
    """
    pipeline = ResumablePipeline(
        [
            (SOME_STEP_1, MultiplyByN(multiply_by=2)),
            (PIPELINE_2, ResumablePipeline([
                (SOME_STEP_2, MultiplyByN(multiply_by=4)),
                (CHECKPOINT, DefaultCheckpoint()),
                (SOME_STEP_3, MultiplyByN(multiply_by=6)),
            ])),
        ],
        cache_folder=tmpdir,
    )
    pipeline.name = ROOT

    pipeline, outputs = pipeline.fit_transform(np.array(range(10)), np.array(range(10)))

    not_saved_paths = [create_some_step3_path(tmpdir)]
    saved_paths = [
        create_root_path(tmpdir),
        create_pipeline2_path(tmpdir),
        create_some_step1_path(tmpdir),
        create_some_step2_path(tmpdir),
        create_some_checkpoint_path(tmpdir),
    ]

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
    for existing_path in saved_paths:
        assert os.path.exists(existing_path)
    for missing_path in not_saved_paths:
        assert not os.path.exists(missing_path)
def given_saved_pipeline(tmpdir: LocalPath):
    """
    Dump a root pipeline, a nested pipeline, three steps and a checkpoint to
    their respective paths under ``tmpdir``, then return a fresh pipeline with
    the same structure whose steps all use ``multiply_by=1``.

    :param tmpdir: folder used as cache folder and as base for the dumped files
    :return: the fresh, not-yet-loaded ``ResumablePipeline`` named ``ROOT``
    """
    root_savers = [(SOME_STEP_1, []), (PIPELINE_2, [TruncableJoblibStepSaver()])]
    root_path = create_root_path(tmpdir, True)

    # Root pipeline: saved without steps, only with the savers of its sub steps.
    saved_root = ResumablePipeline([], cache_folder=tmpdir)
    saved_root.sub_steps_savers = root_savers
    saved_root.name = ROOT
    dump(saved_root, root_path)

    # Nested pipeline, saved the same way.
    saved_pipeline_2 = ResumablePipeline([], cache_folder=tmpdir)
    saved_pipeline_2.name = 'pipeline2'
    saved_pipeline_2.sub_steps_savers = [
        (SOME_STEP_2, []),
        (CHECKPOINT, []),
        (SOME_STEP_3, []),
    ]
    dump(saved_pipeline_2, create_pipeline2_path(tmpdir, True))

    # The individual steps are saved with multipliers 2, 4 and 6, in that order.
    saved_steps = (
        (2, SOME_STEP_1, create_some_step1_path),
        (4, SOME_STEP_2, create_some_step2_path),
        (6, SOME_STEP_3, create_some_step3_path),
    )
    for multiplier, step_name, make_step_path in saved_steps:
        given_saved_some_step(
            multiply_by=multiplier,
            name=step_name,
            path=make_step_path(tmpdir, True),
        )

    saved_checkpoint = DefaultCheckpoint()
    saved_checkpoint.name = CHECKPOINT
    dump(saved_checkpoint, create_some_checkpoint_path(tmpdir, True))

    # The returned pipeline uses multiply_by=1 everywhere, unlike the saved steps.
    fresh_pipeline = ResumablePipeline(
        [
            (SOME_STEP_1, MultiplyByN(multiply_by=1)),
            (PIPELINE_2, ResumablePipeline([
                (SOME_STEP_2, MultiplyByN(multiply_by=1)),
                (CHECKPOINT, DefaultCheckpoint()),
                (SOME_STEP_3, MultiplyByN(multiply_by=1)),
            ])),
        ],
        cache_folder=tmpdir,
    )
    fresh_pipeline.name = ROOT

    return fresh_pipeline
def create_checkpoint_test_case(tmpdir):
    """
    Build a ``CheckpointTest`` holding four tape callbacks and a resumable
    pipeline of two ``FitTransformCallbackStep`` steps separated by a checkpoint.

    :param tmpdir: folder passed to the pipeline as ``cache_folder``
    :return: ``CheckpointTest(tape_transform_1, tape_fit_1, tape_transform_2, tape_fit_2, pipeline)``
    """
    # Tapes are created in the same order as they are named here.
    tape_transform_1, tape_fit_1, tape_transform_2, tape_fit_2 = (
        TapeCallbackFunction() for _ in range(4)
    )

    steps = [
        ('step1', FitTransformCallbackStep(tape_transform_1, tape_fit_1)),
        ('checkpoint', DefaultCheckpoint()),
        ('step2', FitTransformCallbackStep(tape_transform_2, tape_fit_2)),
    ]
    pipeline = ResumablePipeline(steps, cache_folder=tmpdir)

    return CheckpointTest(tape_transform_1, tape_fit_1, tape_transform_2, tape_fit_2, pipeline)
def test_when_no_hyperparams_should_save_checkpoint_pickle(tmpdir: LocalPath):
    """
    Fit-transform a pipeline created without hyperparameters and check that
    ``0.pickle`` and ``1.pickle`` exist under the checkpoint's ``di`` and ``eo``
    folders.
    """
    tape = TapeCallbackFunction()
    pickle_checkpoint_step = DefaultCheckpoint()
    pipeline = create_pipeline(tmpdir, pickle_checkpoint_step, tape)

    pipeline, actual_data_inputs = pipeline.fit_transform(data_inputs, expected_outputs)

    recorded_names = tape.get_name_tape()
    assert np.array_equal(actual_data_inputs, data_inputs)
    assert recorded_names == ["1", "2", "3"]

    checkpoint_folder = os.path.join(tmpdir, 'ResumablePipeline', 'pickle_checkpoint')
    # Same four files, checked in the same order: both 'di' files, then both 'eo' files.
    for sub_folder in ('di', 'eo'):
        for file_name in ('0.pickle', '1.pickle'):
            assert os.path.exists(os.path.join(checkpoint_folder, sub_folder, file_name))
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """
    Run the same AutoML search twice, once over a classic ``Pipeline`` and once
    over a ``ResumablePipeline`` with two checkpoints, printing the elapsed time,
    outputs, score and hyperparameters of each run.

    :param tmpdir: base folder; each run caches in its own sub folder ('classic' / 'resumable')
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: value passed to ``AutoML`` as ``n_trials``
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparams_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    # ---- Classic pipeline -------------------------------------------------
    print('Classic Pipeline:')

    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')
    classic_steps = [
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = Pipeline(classic_steps, cache_folder=classic_pipeline_folder)
    pipeline = pipeline.set_hyperparams_space(hyperparams_space)

    # The timed region covers AutoML construction, fitting and prediction.
    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=classic_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(data_inputs, expected_outputs)
    best_model = auto_ml.get_best_model()
    outputs = best_model.predict(data_inputs)
    time_b = time.time()

    actual_score = mean_squared_error(expected_outputs, outputs)
    elapsed = time_b - time_a
    print('{0} seconds'.format(elapsed))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    # ---- Resumable pipeline -----------------------------------------------
    print('Resumable Pipeline:')

    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')
    resumable_steps = [
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = ResumablePipeline(resumable_steps, cache_folder=resumable_pipeline_folder)
    pipeline = pipeline.set_hyperparams_space(hyperparams_space)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=resumable_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(data_inputs, expected_outputs)
    best_model = auto_ml.get_best_model()
    outputs = best_model.predict(data_inputs)
    time_b = time.time()

    # Cache is flushed outside the timed region, before scoring.
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(expected_outputs, outputs)
    elapsed = time_b - time_a
    print('{0} seconds'.format(elapsed))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """
    Run the same ``RandomSearch`` twice, once over a classic ``Pipeline`` and once
    over a ``ResumablePipeline`` with two checkpoints, printing the elapsed time,
    outputs, score and hyperparameters of each run.

    :param tmpdir: folder passed to the resumable pipeline as ``cache_folder``
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: value passed to ``RandomSearch`` as ``n_iter``
    """
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparams_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    # ---- Classic pipeline -------------------------------------------------
    print('Classic Pipeline:')

    classic_steps = [
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = Pipeline(classic_steps)
    pipeline = pipeline.set_hyperparams_space(hyperparams_space)

    # The timed region covers search construction, fitting and transforming.
    time_a = time.time()
    search = RandomSearch(pipeline, n_iter=n_iter, higher_score_is_better=True)
    best_model = search.fit(data_inputs, expected_outputs)
    outputs = best_model.transform(data_inputs)
    time_b = time.time()

    actual_score = mean_squared_error(expected_outputs, outputs)
    elapsed = time_b - time_a
    print('{0} seconds'.format(elapsed))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    # ---- Resumable pipeline -----------------------------------------------
    print('Resumable Pipeline:')

    resumable_steps = [
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ]
    pipeline = ResumablePipeline(resumable_steps, cache_folder=tmpdir)
    pipeline = pipeline.set_hyperparams_space(hyperparams_space)

    time_a = time.time()
    search = RandomSearch(pipeline, n_iter=n_iter, higher_score_is_better=True)
    best_model = search.fit(data_inputs, expected_outputs)
    outputs = best_model.transform(data_inputs)
    time_b = time.time()

    # Cache is flushed outside the timed region, before scoring.
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(expected_outputs, outputs)
    elapsed = time_b - time_a
    print('{0} seconds'.format(elapsed))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)