def test_pipeline_setup_incrementally():
    class SomeStepThatFits(NonTransformableMixin, BaseStep):
        def __init__(self):
            BaseStep.__init__(self)
            self.has_fitted = False

        def fit(self, data_inputs, expected_outputs=None) -> _FittableStep:
            self.has_fitted = True
            return self

    class StepWithSensitiveSetup(Identity):
        """Asserts that the step before it has fitted, and the step after it has not, by the time setup runs."""

        def __init__(self):
            Identity.__init__(self)

        def setup(self, context: ExecutionContext = None) -> BaseTransformer:
            assert some_step.has_fitted is True
            assert some_step2.has_fitted is False
            return self

    some_step = SomeStepThatFits()
    some_step2 = SomeStepThatFits()

    p = Pipeline([some_step, StepWithSensitiveSetup(), some_step2])
    p.fit_transform(None, None)
def test_expectedoutputnull_is_fine_when_null(tmpdir):
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = None

    p = Pipeline([SomeStep()])
    p.fit_transform(data_inputs, expected_outputs)
def test_expectedoutputnull_raise_exception_when_notnull(tmpdir):
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2

    p = Pipeline([AssertExpectedOutputIsNone()])

    with pytest.raises(AssertionError) as error_info:
        p.fit_transform(data_inputs, expected_outputs)
def test_forcehandleidentity_does_not_crash(tmpdir):
    p = Pipeline([ForceHandleIdentity()])
    data_inputs = np.array([0, 1, 2, 3])
    expected_outputs = data_inputs * 2

    p.fit(data_inputs, expected_outputs)
    p.fit_transform(data_inputs, expected_outputs)
    p.transform(data_inputs=data_inputs)
def test_load_full_dump_from_path(tmpdir):
    # Given
    tape_fit_callback_function = TapeCallbackFunction()
    tape_transform_callback_function = TapeCallbackFunction()
    pipeline = Pipeline([
        ('step_a', Identity()),
        ('step_b', OutputTransformerWrapper(FitTransformCallbackStep(
            tape_fit_callback_function, tape_transform_callback_function)))
    ], cache_folder=tmpdir).set_name(PIPELINE_NAME)

    # When
    pipeline, outputs = pipeline.fit_transform(DATA_INPUTS, EXPECTED_OUTPUTS)
    pipeline.save(ExecutionContext(tmpdir), full_dump=True)

    # Then
    loaded_pipeline = ExecutionContext(tmpdir).load(os.path.join(PIPELINE_NAME, 'step_b'))

    assert isinstance(loaded_pipeline, OutputTransformerWrapper)
    loaded_step_b_wrapped_step = loaded_pipeline.wrapped
    assert np.array_equal(
        loaded_step_b_wrapped_step.transform_callback_function.data[0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][1],
        [None] * len(EXPECTED_OUTPUTS))
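# Note: saving with full_dump=True persists every nested step under its own path,
# which is what lets the test above reload only 'step_b' through
# os.path.join(PIPELINE_NAME, 'step_b'). PIPELINE_NAME, DATA_INPUTS and
# EXPECTED_OUTPUTS are module-level fixtures defined elsewhere in the suite; a
# plausible sketch (assumed values, not the real fixtures):
#
#     PIPELINE_NAME = 'pipeline'
#     DATA_INPUTS = np.array(range(10))
#     EXPECTED_OUTPUTS = np.array(range(10, 20))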
def test_pipeline_nested_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "5", "6", "7", "7", "6", "5", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        Pipeline([
            Identity(),
            TransformCallbackStep(tape.callback, ["3"]),
            TransformCallbackStep(tape.callback, ["4"]),
            TransformCallbackStep(tape.callback, ["5"]),
            Identity()
        ]),
        TransformCallbackStep(tape.callback, ["6"]),
        TransformCallbackStep(tape.callback, ["7"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))  # will add range(1, 8) to tape.

    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")

    p.transform(np.ones((1, 1)))  # will add reversed(range(1, 8)) to tape.

    print(expected_tape)
    print(tape.get_name_tape())
    assert expected_tape == tape.get_name_tape()
def test_pipeline_nested_mutate_inverse_transform_without_identities():
    """
    This test guards against a strange bug at the boundary of nested pipelines
    that occurred when no Identity steps were used.
    """
    expected_tape = ["1", "2", "3", "4", "5", "6", "7", "7", "6", "5", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        Pipeline([
            TransformCallbackStep(tape.callback, ["3"]),
            TransformCallbackStep(tape.callback, ["4"]),
            TransformCallbackStep(tape.callback, ["5"]),
        ]),
        TransformCallbackStep(tape.callback, ["6"]),
        TransformCallbackStep(tape.callback, ["7"]),
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))  # will add range(1, 8) to tape.

    print("[mutating, inversing, and calling each inverse_transform]")
    reversed(p).transform(np.ones((1, 1)))  # will add reversed(range(1, 8)) to tape, calling inverse_transforms.

    print(expected_tape)
    print(tape.get_name_tape())
    assert expected_tape == tape.get_name_tape()
def test_model_stacking_fit_transform():
    model_stacking = Pipeline([
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    expected_outputs_shape = (379, 1)
    data_inputs_shape = (379, 13)
    data_inputs = _create_data(data_inputs_shape)
    expected_outputs = _create_data(expected_outputs_shape)

    model_stacking, outputs = model_stacking.fit_transform(data_inputs, expected_outputs)

    assert outputs.shape == expected_outputs_shape
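# `_create_data` is a helper assumed by the test above and defined elsewhere in
# the suite. A minimal sketch that satisfies the shapes used here (the uniform
# random distribution is an assumption):
def _create_data(shape):
    return np.random.random(shape)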
def test_pipeline_fit_transform(steps_list, pipeline_runner):
    data_input_ = [AN_INPUT]
    expected_output_ = [AN_EXPECTED_OUTPUT]
    p = Pipeline(steps_list, pipeline_runner=pipeline_runner())

    p, result = p.fit_transform(data_input_, expected_output_)

    assert tuple(result) == tuple(expected_output_)
def test_tape_callback():
    expected_tape = ["1", "2", "3", "a", "b", "4"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        TransformCallbackStep(tape.callback, ["3"]),
        AddFeatures([
            TransformCallbackStep(tape.callback, ["a"]),
            TransformCallbackStep(tape.callback, ["b"]),
        ]),
        TransformCallbackStep(tape.callback, ["4"]),
        Identity()
    ])

    p.fit_transform(np.ones((1, 1)))

    assert tape.get_name_tape() == expected_tape
def test_tensorflowv1_saver(tmpdir):
    data_inputs = np.array([
        3.3, 4.4, 5.5, 6.71, 6.93, 4.168, 9.779, 6.182, 7.59, 2.167, 7.042,
        10.791, 5.313, 7.997, 5.654, 9.27, 3.1
    ])
    expected_outputs = np.array([
        1.7, 2.76, 2.09, 3.19, 1.694, 1.573, 3.366, 2.596, 2.53, 1.221, 2.827,
        3.465, 1.65, 2.904, 2.42, 2.94, 1.3
    ])
    model = Pipeline([create_model_step()])

    for i in range(50):
        model, outputs = model.fit_transform(data_inputs, expected_outputs)

    model.save(ExecutionContext(root=tmpdir))

    model = Pipeline([create_model_step()]).load(ExecutionContext(root=tmpdir))
    model, outputs = model.fit_transform(data_inputs, expected_outputs)
    assert ((outputs - expected_outputs) ** 2).mean() < 0.25
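# The test above exercises a full save/load round trip: fit for 50 iterations,
# save the TensorFlow v1 model through the pipeline saver, rebuild a fresh
# pipeline, reload the weights from disk, and verify the reloaded model still
# predicts with a mean squared error under 0.25.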
def test_fit_transform(steps: List[BaseStep], expected_tape: List[str]):
    tape.data = []
    tape.name_tape = []
    pipeline = Pipeline(steps=steps)

    actual_pipeline, actual_data_inputs = pipeline.fit_transform(data_inputs, expected_outputs)

    actual_tape = tape.get_name_tape()
    assert isinstance(actual_pipeline, Pipeline)
    assert actual_tape == expected_tape
    assert np.array_equal(actual_data_inputs, data_inputs)
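# `tape`, `data_inputs` and `expected_outputs` are module-level fixtures shared
# by this test and the checkpoint tests below, and the `steps`/`expected_tape`
# arguments are presumably injected by a pytest parametrization defined elsewhere
# in the suite. A plausible sketch of the shared fixtures (assumed values, not
# the real ones):
#
#     tape = TapeCallbackFunction()
#     data_inputs = np.array([1, 2])
#     expected_outputs = np.array([2, 4])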
def test_feature_union_should_fit_transform_with_zip_features():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=ZipFeatures())
    ])
    data_inputs = np.random.randint(low=0, high=100, size=(2, 20))
    expected_outputs = None

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    assert np.array_equal(outputs, np.stack([data_inputs, data_inputs], axis=1))
def test_feature_union_should_fit_transform_with_numpy_transpose():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyTranspose())
    ])
    data_inputs = np.random.randint(low=0, high=100, size=(1, 20))
    expected_outputs = np.random.randint(low=0, high=100, size=(1, 20))

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    assert np.array_equal(outputs, np.array([data_inputs, data_inputs]).transpose())
def test_feature_union_should_fit_transform_with_concatenate_inner_features():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyConcatenateInnerFeatures())
    ])
    data_inputs = np.random.randint(low=0, high=100, size=(1, 20))
    expected_outputs = np.random.randint(low=0, high=100, size=(1, 20))

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    # Inner features are concatenated on the last axis.
    assert np.array_equal(outputs, np.concatenate([data_inputs, data_inputs], axis=-1))
def test_fit_transform_should_fit_then_use_cache(tmpdir):
    tape_transform = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        JoblibValueCachingWrapper(
            LogFitTransformCallbackStep(tape_transform, tape_fit, transform_function=np.log),
            tmpdir)
    ])

    p, outputs = p.fit_transform([1, 1, 2, 2], [2, 2, 4, 4])

    assert outputs == EXPECTED_OUTPUTS
    assert tape_transform.data == [[1], [2]]
    assert tape_fit.data == [([1, 1, 2, 2], [2, 2, 4, 4])]
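# The tape assertions above show the value cache at work: four values flow
# through the pipeline, but the wrapped step only transforms the two distinct
# ones ([1] and [2]); repeated values are served from the cache. EXPECTED_OUTPUTS
# is a module-level constant from this test file; a consistent sketch (an
# assumption, the real fixture may differ in form):
#
#     EXPECTED_OUTPUTS = np.log([1, 1, 2, 2]).tolist()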
def test_should_flush_cache_on_every_fit(tmpdir):
    tape_transform = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    wrapper = JoblibValueCachingWrapper(
        LogFitTransformCallbackStep(tape_transform, tape_fit, transform_function=np.log),
        cache_folder=tmpdir)
    p = Pipeline([wrapper])
    wrapper.create_checkpoint_path()
    wrapper.write_cache(1, 10)
    wrapper.write_cache(2, 20)

    p, outputs = p.fit_transform([1, 1, 2, 2], [2, 2, 4, 4])

    assert outputs == EXPECTED_OUTPUTS
    assert tape_transform.data == [[1], [2]]
    assert tape_fit.data == [([1, 1, 2, 2], [2, 2, 4, 4])]
def test_data_shuffling_should_shuffle_data_inputs_and_expected_outputs():
    callback_fit = TapeCallbackFunction()
    callback_transform = TapeCallbackFunction()
    data_shuffler = Pipeline([
        DataShuffler(seed=42, increment_seed_after_each_fit=True),
        FitTransformCallbackStep(callback_transform, callback_fit)
    ])
    data_inputs = np.array(range(10))
    expected_outputs = np.array(range(10, 20))

    data_shuffler, outputs = data_shuffler.fit_transform(data_inputs, expected_outputs)

    assert not np.array_equal(outputs, data_inputs)
    assert not np.array_equal(callback_fit.data[0][0], data_inputs)
    assert not np.array_equal(callback_fit.data[0][1], expected_outputs)
    assert not np.array_equal(callback_transform.data, data_inputs)
def test_fit_transform_should_fit_transform_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        ForEachDataInput(Pipeline([
            FitTransformCallbackStep(tape.callback, tape_fit, ["1"]),
            FitTransformCallbackStep(tape.callback, tape_fit, ["2"]),
        ]))
    ])
    data_inputs = [[0, 1], [1, 2]]
    expected_outputs = [[2, 3], [4, 5]]

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.data == [([0, 1], [2, 3]), ([0, 1], [2, 3]),
                             ([1, 2], [4, 5]), ([1, 2], [4, 5])]
def test_should_save_checkpoint_pickle(tmpdir: LocalPath):
    tape = TapeCallbackFunction()
    pickle_checkpoint_step = PickleCheckpointStep('1', tmpdir)
    pipeline = Pipeline(steps=[
        TransformCallbackStep(tape.callback, ["1"]),
        pickle_checkpoint_step,
        TransformCallbackStep(tape.callback, ["2"]),
        TransformCallbackStep(tape.callback, ["3"])
    ])

    pipeline, actual_data_inputs = pipeline.fit_transform(data_inputs, expected_outputs)

    actual_tape = tape.get_name_tape()
    assert actual_data_inputs == data_inputs
    assert actual_tape == ["1", "2", "3"]
    assert os.path.exists(pickle_checkpoint_step.get_checkpoint_file_path(data_inputs))
def main():
    np.random.seed(42)
    X = np.random.randint(5, size=(100, 5))

    # Create and fit the pipeline:
    pipeline = Pipeline([
        StandardScaler(),
        Identity(),
        Pipeline([
            Identity(),
            Identity(),  # Note: an Identity step is a step that does nothing.
            Identity(),  # We use it here for demonstration purposes.
            Pipeline([
                Identity(),
                PCA(n_components=2)
            ])
        ])
    ])
    pipeline, X_t = pipeline.fit_transform(X)

    # Get the components:
    pca_components = pipeline["Pipeline"]["Pipeline"][-1].get_wrapped_sklearn_predictor().components_
    assert pca_components.shape == (2, 5)
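# Note on the indexing above: Neuraxle pipelines can be indexed by step name, so
# pipeline["Pipeline"]["Pipeline"][-1] drills into the two nested pipelines by
# their default names and takes their last step, the SKLearnWrapper around PCA,
# before unwrapping the fitted sklearn estimator to read its components_.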
def test_pipeline_simple_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        TransformCallbackStep(tape.callback, ["3"]),
        TransformCallbackStep(tape.callback, ["4"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))

    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")

    p.transform(np.ones((1, 1)))

    assert expected_tape == tape.get_name_tape()
def test_should_load_checkpoint_pickle(tmpdir: LocalPath):
    tape = TapeCallbackFunction()
    force_checkpoint_name = 'checkpoint_a'
    pickle_checkpoint_step = PickleCheckpointStep(
        force_checkpoint_name=force_checkpoint_name, cache_folder=tmpdir)
    pickle_checkpoint_step.set_checkpoint_path(force_checkpoint_name)
    with open(pickle_checkpoint_step.get_checkpoint_file_path(data_inputs), 'wb') as file:
        pickle.dump(data_inputs, file)
    pipeline = Pipeline(steps=[
        ('a', TransformCallbackStep(tape.callback, ["1"])),
        ('b', TransformCallbackStep(tape.callback, ["2"])),
        (force_checkpoint_name, pickle_checkpoint_step),
        ('c', TransformCallbackStep(tape.callback, ["3"]))
    ])

    pipeline, actual_data_inputs = pipeline.fit_transform(data_inputs, expected_outputs)

    actual_tape = tape.get_name_tape()
    assert actual_data_inputs == data_inputs
    assert actual_tape == ["3"]
def test_expand_dim_fit_transform_with_summary_hasher():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(handle_fit_callback, handle_transform_callback,
                               handle_fit_transform_callback))
    ])
    p['ExpandDim'].hashers = [SomeSummaryHasher(fake_summary_id=SUMMARY_ID)]

    p, outputs = p.fit_transform(np.array(range(10)), np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_transform_callback.data == []
    assert handle_fit_callback.data == []
    assert handle_fit_transform_callback.data[0][0].current_ids == [SUMMARY_ID]
    assert handle_fit_transform_callback.data[0][0].summary_id == SUMMARY_ID
    assert np.array_equal(
        np.array(handle_fit_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_fit_transform_callback.data[0][0].expected_outputs),
        np.array([np.array(range(10))]))
def test_expand_dim_fit_transform():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(handle_fit_callback, handle_transform_callback,
                               handle_fit_transform_callback))
    ])

    p, outputs = p.fit_transform(np.array(range(10)), np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_transform_callback.data == []
    assert handle_fit_callback.data == []
    assert handle_fit_transform_callback.data[0][0].current_ids == [
        '781e5e245d69b566979b86e28d23f2c7'
    ]
    assert np.array_equal(
        np.array(handle_fit_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_fit_transform_callback.data[0][0].expected_outputs),
        np.array([np.array(range(10))]))
def main(chosen_device):
    exercice_number = 1
    print('exercise {}\n=================='.format(exercice_number))

    data_inputs, expected_outputs = generate_data(
        # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.format(data_inputs.shape))
    print('expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'.format(expected_outputs.shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim': 100,
        'layers_stacked_count': 2,
        'lambda_loss_amount': 0.0003,
        'learning_rate': 0.006,
        'window_size_future': sequence_length,
        'output_dim': output_dim,
        'input_dim': input_dim
    })

    feature_0_metric = metric_3d_to_2d_wrapper(mean_squared_error)
    metrics = {'mse': feature_0_metric}

    signal_prediction_pipeline = Pipeline([
        ForEachDataInput(MeanStdNormalizer()),
        ToNumpy(),
        PlotPredictionsWrapper(Tensorflow2ModelStep(
            # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
            create_model=create_model,
            create_loss=create_loss,
            create_optimizer=create_optimizer,
            expected_outputs_dtype=tf.dtypes.float32,
            data_inputs_dtype=tf.dtypes.float32,
            print_loss=True).set_hyperparams(seq2seq_pipeline_hyperparams))
    ]).set_name('SignalPrediction')

    pipeline = Pipeline([
        EpochRepeater(
            ValidationSplitWrapper(
                MetricsWrapper(
                    Pipeline([
                        TrainOnlyWrapper(DataShuffler()),
                        MiniBatchSequentialPipeline([
                            MetricsWrapper(signal_prediction_pipeline,
                                           metrics=metrics,
                                           name='batch_metrics')
                        ], batch_size=batch_size)
                    ]),
                    metrics=metrics,
                    name='epoch_metrics',
                    print_metrics=True),
                test_size=validation_size,
                scoring_function=feature_0_metric),
            epochs=epochs)
    ])

    pipeline, outputs = pipeline.fit_transform(data_inputs, expected_outputs)

    plot_metrics(pipeline=pipeline, exercice_number=exercice_number)
    plot_predictions(data_inputs, expected_outputs, pipeline,
                     max_plotted_validation_predictions)
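# The wrapper nesting in `pipeline` above reads inside-out: the signal-prediction
# pipeline is wrapped in batch-level metrics and mini-batched, preceded by a
# train-only data shuffler, wrapped in epoch-level metrics, split into train and
# validation sets by ValidationSplitWrapper, and finally repeated for `epochs`
# epochs by EpochRepeater.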