def __init__(
        self,
        steps: List[QueuedPipelineStepsTuple],
        batch_size,
        n_workers_per_step=None,
        max_queue_size=None,
        data_joiner=None,
        use_threading=False,
        use_savers=False,
        cache_folder=None
):
    NonFittableMixin.__init__(self)
    CustomPipelineMixin.__init__(self)

    if data_joiner is None:
        data_joiner = NumpyConcatenateOuterBatch()
    self.data_joiner = data_joiner
    self.max_queue_size = max_queue_size
    self.batch_size = batch_size
    self.n_workers_per_step = n_workers_per_step
    self.use_threading = use_threading
    self.use_savers = use_savers

    MiniBatchSequentialPipeline.__init__(
        self,
        steps=self._initialize_steps_as_tuple(steps),
        cache_folder=cache_folder)
    self._refresh_steps()
def main(): """ The task is to sleep 0.02 seconds for each data input and then multiply by 2. """ sleep_time = 0.02 preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)] # Classical pipeline - all at once with one big batch: p = Pipeline(preprocessing_and_model_steps) time_vanilla_pipeline, output_classical = eval_run_time(p) print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.") # Classical minibatch pipeline - minibatch size 10: p = MiniBatchSequentialPipeline(preprocessing_and_model_steps, batch_size=10) time_minibatch_pipeline, output_minibatch = eval_run_time(p) print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.") # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that # have a max queue size of 10 batches between preprocessing and the model: p = SequentialQueuedPipeline(preprocessing_and_model_steps, n_workers_per_step=16, max_queue_size=10, batch_size=10) time_parallel_pipeline, output_parallel = eval_run_time(p) print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.") assert time_parallel_pipeline < time_minibatch_pipeline, str((time_parallel_pipeline, time_vanilla_pipeline)) assert np.array_equal(output_classical, output_minibatch) assert np.array_equal(output_classical, output_parallel)
def __init__(
        self,
        steps: List[QueuedPipelineStepsTuple],
        batch_size: int,
        n_workers_per_step: int = None,
        max_queue_size: int = None,
        data_joiner=None,
        use_threading: bool = False,
        use_savers: bool = False,
        include_incomplete_batch: bool = False,
        default_value_data_inputs: Union[Any, AbsentValuesNullObject] = None,
        default_value_expected_outputs: Union[Any, AbsentValuesNullObject] = None,
        cache_folder: str = None,
):
    if data_joiner is None:
        data_joiner = NumpyConcatenateOuterBatch()
    self.data_joiner = data_joiner
    self.max_queue_size = max_queue_size
    self.batch_size: int = batch_size
    self.n_workers_per_step = n_workers_per_step
    self.use_threading = use_threading
    self.use_savers = use_savers
    self.include_incomplete_batch: bool = include_incomplete_batch
    self.default_value_data_inputs: Union[Any, AbsentValuesNullObject] = default_value_data_inputs
    self.default_value_expected_outputs: Union[Any, AbsentValuesNullObject] = default_value_expected_outputs

    MiniBatchSequentialPipeline.__init__(
        self,
        steps=self._initialize_steps_as_tuple(steps),
        cache_folder=cache_folder,
        batch_size=batch_size,
        include_incomplete_batch=include_incomplete_batch,
        default_value_data_inputs=default_value_data_inputs,
        default_value_expected_outputs=default_value_expected_outputs)
    self._refresh_steps()
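# A minimal hedged usage sketch for the incomplete-batch options above,
# reusing `Sleep`, `MultiplyByN` and `ForEach` from the earlier example.
# With 25 data inputs and batch_size=10, the trailing batch of 5 items is
# processed instead of dropped when include_incomplete_batch=True (the exact
# padding semantics of the default values may differ per Neuraxle version):
p = SequentialQueuedPipeline(
    [ForEach(Sleep(sleep_time=0.02)), MultiplyByN(2)],
    batch_size=10,
    n_workers_per_step=4,
    max_queue_size=10,
    include_incomplete_batch=True,
)
outputs = p.transform(list(range(25)))  # 25 outputs, last batch has 5 items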
def test_mini_batch_sequential_pipeline_should_transform_steps_sequentially_for_each_barrier_for_each_batch():
    # Given
    tape1 = TapeCallbackFunction()
    tape2 = TapeCallbackFunction()
    tape3 = TapeCallbackFunction()
    tape4 = TapeCallbackFunction()
    p = MiniBatchSequentialPipeline([
        MultiplyBy2TransformCallbackStep(tape1, ["1"]),
        MultiplyBy2TransformCallbackStep(tape2, ["2"]),
        Joiner(batch_size=10),
        MultiplyBy2TransformCallbackStep(tape3, ["3"]),
        MultiplyBy2TransformCallbackStep(tape4, ["4"]),
        Joiner(batch_size=10)
    ])

    # When
    outputs = p.transform(list(range(20)))

    # Then
    assert outputs == [0, 16, 32, 48, 64, 80, 96, 112, 128, 144,
                       160, 176, 192, 208, 224, 240, 256, 272, 288, 304]
    assert tape1.data == [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
    assert tape1.name_tape == ["1", "1"]
    assert tape2.data == [[0, 2, 4, 6, 8, 10, 12, 14, 16, 18], [20, 22, 24, 26, 28, 30, 32, 34, 36, 38]]
    assert tape2.name_tape == ["2", "2"]
    assert tape3.data == [[0, 4, 8, 12, 16, 20, 24, 28, 32, 36], [40, 44, 48, 52, 56, 60, 64, 68, 72, 76]]
    assert tape3.name_tape == ["3", "3"]
    assert tape4.data == [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72], [80, 88, 96, 104, 112, 120, 128, 136, 144, 152]]
    assert tape4.name_tape == ["4", "4"]
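# Note on the test above: each `Joiner(batch_size=10)` acts as a barrier.
# The steps before it run sequentially on each mini-batch of 10 items, and
# the per-batch results are joined back together before the steps after the
# barrier run. Four multiply-by-2 steps give a x16 factor overall, which is
# why the last of the 20 inputs yields 19 * 16 == 304.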
def _create_mini_batch_pipeline(self, wrapped: BaseStep) -> BaseStep:
    """
    Add mini-batching and batch metrics by wrapping the step with
    :class:`MetricsWrapper`, and :class:`MiniBatchSequentialPipeline`.

    :param wrapped: pipeline step
    :type wrapped: BaseStep
    :return: wrapped pipeline step
    :rtype: MetricsWrapper
    """
    if self.batch_size is not None:
        wrapped = MetricsWrapper(
            wrapped=wrapped,
            metrics=self.batch_metrics,
            name=BATCH_METRICS_STEP_NAME,
            print_metrics=self.print_batch_metrics)
        wrapped = MiniBatchSequentialPipeline(
            [wrapped],
            batch_size=self.batch_size)

    return wrapped
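# This wrapping mirrors the training example further below, where a
# `MetricsWrapper` named 'batch_metrics' sits inside the
# `MiniBatchSequentialPipeline` and is scored once per mini-batch, while a
# second `MetricsWrapper` named 'epoch_metrics' wraps the whole pipeline and
# is scored once per epoch on the joined outputs.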
def test_minibatch_sequential_pipeline_change_batch_size_works():
    # Given
    tape1 = TapeCallbackFunction()
    tape1_fit = TapeCallbackFunction()
    tape2 = TapeCallbackFunction()
    tape2_fit = TapeCallbackFunction()
    p = MiniBatchSequentialPipeline([
        MultiplyBy2FitTransformCallbackStep(tape1, tape1_fit, ["1"]),
        Joiner(batch_size=10),
        MultiplyBy2FitTransformCallbackStep(tape2, tape2_fit, ["2"]),
        Joiner(batch_size=10)
    ])

    # When
    p, outputs = p.fit_transform(list(range(20)), list(range(20)))
    p.set_batch_size(5)
    p, outputs = p.fit_transform(list(range(20, 30)), list(range(20, 30)))

    # Then
    assert tape1.data == [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                          [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                          [20, 21, 22, 23, 24],
                          [25, 26, 27, 28, 29]]
    assert tape1_fit.data == [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                              ([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
                              ([20, 21, 22, 23, 24], [20, 21, 22, 23, 24]),
                              ([25, 26, 27, 28, 29], [25, 26, 27, 28, 29])]
    assert tape1.name_tape == ["1", "1", "1", "1"]
    assert tape2.data == [[0, 2, 4, 6, 8, 10, 12, 14, 16, 18],
                          [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
                          [40, 42, 44, 46, 48],
                          [50, 52, 54, 56, 58]]
    assert tape2_fit.data == [([0, 2, 4, 6, 8, 10, 12, 14, 16, 18], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                              ([20, 22, 24, 26, 28, 30, 32, 34, 36, 38], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
                              ([40, 42, 44, 46, 48], [20, 21, 22, 23, 24]),
                              ([50, 52, 54, 56, 58], [25, 26, 27, 28, 29])]
    assert tape2.name_tape == ["2", "2", "2", "2"]
def main(chosen_device):
    exercice_number = 1
    print('exercice {}\n=================='.format(exercice_number))

    data_inputs, expected_outputs = generate_data(
        # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.format(data_inputs.shape))
    print('expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'.format(expected_outputs.shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim': 100,
        'layers_stacked_count': 2,
        'lambda_loss_amount': 0.0003,
        'learning_rate': 0.006,
        'window_size_future': sequence_length,
        'output_dim': output_dim,
        'input_dim': input_dim
    })

    pipeline = Pipeline([
        MiniBatchSequentialPipeline([
            ForEachDataInput(MeanStdNormalizer()),
            ToNumpy(),
            PlotPredictionsWrapper(Tensorflow2ModelStep(
                # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer,
                expected_outputs_dtype=tf.dtypes.float32,
                data_inputs_dtype=tf.dtypes.float32,
                device_name=chosen_device,
                print_loss=True
            ).set_hyperparams(seq2seq_pipeline_hyperparams))
        ], batch_size=batch_size),
    ]).set_name('SignalPrediction')

    trainer = Trainer(
        epochs=epochs,
        validation_splitter=ValidationSplitter(test_size=validation_size),
        scoring_callback=ScoringCallback(
            metric_function=metric_3d_to_2d_wrapper(mean_squared_error),
            higher_score_is_better=False))

    trial: Trial = trainer.train(
        pipeline=pipeline,
        data_inputs=data_inputs,
        expected_outputs=expected_outputs)

    plot_metrics(
        metric_name='mse',
        train_values=trial.validation_splits[0].metrics_results['main']['train_values'],
        validation_values=trial.validation_splits[0].metrics_results['main']['validation_values'],
        exercice_number=exercice_number)

    # Get the trained pipeline.
    pipeline = trial.get_trained_pipeline(split_number=0)

    # Get the validation set with the trainer.validation_split_function.split function.
    _, _, data_inputs_validation, expected_outputs_validation = trainer.validation_split_function.split(
        data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Enable the plotting feature inside the PlotPredictionsWrapper wrapper step.
    pipeline.apply('toggle_plotting')
    pipeline.apply(
        method='set_max_plotted_predictions',
        max_plotted_predictions=max_plotted_validation_predictions)

    # Transform with the trained pipeline to plot predictions.
    pipeline.transform_data_container(DataContainer(
        data_inputs=data_inputs_validation[0],
        expected_outputs=expected_outputs_validation[0]))
def main():
    def accuracy(data_inputs, expected_outputs):
        return np.mean(np.argmax(np.array(data_inputs), axis=1) == np.argmax(np.array(expected_outputs), axis=1))

    # Load the dataset.
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ).set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                'n_classes': 3
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy, higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(name='f1', metric_function=f1_score_weighted, higher_score_is_better=True),
            MetricCallback(name='recall', metric_function=recall_score_weighted, higher_score_is_better=True),
            MetricCallback(name='precision', metric_function=precision_score_weighted, higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    auto_ml = auto_ml.fit(data_inputs=data_inputs)
def main(chosen_device):
    exercice_number = 1
    print('exercice {}\n=================='.format(exercice_number))

    data_inputs, expected_outputs = generate_data(
        # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.format(data_inputs.shape))
    print('expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'.format(expected_outputs.shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim': 100,
        'layers_stacked_count': 2,
        'lambda_loss_amount': 0.0003,
        'learning_rate': 0.006,
        'window_size_future': sequence_length,
        'output_dim': output_dim,
        'input_dim': input_dim
    })
    feature_0_metric = metric_3d_to_2d_wrapper(mean_squared_error)
    metrics = {'mse': feature_0_metric}

    signal_prediction_pipeline = Pipeline([
        ForEachDataInput(MeanStdNormalizer()),
        ToNumpy(),
        PlotPredictionsWrapper(Tensorflow2ModelStep(
            # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
            create_model=create_model,
            create_loss=create_loss,
            create_optimizer=create_optimizer,
            expected_outputs_dtype=tf.dtypes.float32,
            data_inputs_dtype=tf.dtypes.float32,
            print_loss=True
        ).set_hyperparams(seq2seq_pipeline_hyperparams))
    ]).set_name('SignalPrediction')

    pipeline = Pipeline([EpochRepeater(
        ValidationSplitWrapper(
            MetricsWrapper(Pipeline([
                TrainOnlyWrapper(DataShuffler()),
                MiniBatchSequentialPipeline([
                    MetricsWrapper(
                        signal_prediction_pipeline,
                        metrics=metrics,
                        name='batch_metrics')
                ], batch_size=batch_size)
            ]), metrics=metrics, name='epoch_metrics', print_metrics=True),
            test_size=validation_size,
            scoring_function=feature_0_metric),
        epochs=epochs)])

    pipeline, outputs = pipeline.fit_transform(data_inputs, expected_outputs)

    plot_metrics(pipeline=pipeline, exercice_number=exercice_number)
    plot_predictions(data_inputs, expected_outputs, pipeline, max_plotted_validation_predictions)