Beispiel #1
0
    def __init__(
            self,
            steps: List[QueuedPipelineStepsTuple],
            batch_size,
            n_workers_per_step=None,
            max_queue_size=None,
            data_joiner=None,
            use_threading=False,
            use_savers=False,
            cache_folder=None
    ):
        """
        Store the queue settings on the instance, then initialize the mini-batch pipeline parent.

        :param steps: step definitions; converted with ``_initialize_steps_as_tuple`` before reaching the parent
        :param batch_size: kept on the instance as ``batch_size``
        :param n_workers_per_step: kept on the instance as ``n_workers_per_step``
        :param max_queue_size: kept on the instance as ``max_queue_size``
        :param data_joiner: joiner kept on the instance; a new ``NumpyConcatenateOuterBatch`` is used when ``None``
        :param use_threading: kept on the instance as ``use_threading``
        :param use_savers: kept on the instance as ``use_savers``
        :param cache_folder: forwarded to ``MiniBatchSequentialPipeline.__init__``
        """
        NonFittableMixin.__init__(self)
        CustomPipelineMixin.__init__(self)

        # Settings are stored before the steps are converted below.
        # NOTE(review): presumably _initialize_steps_as_tuple reads them - confirm.
        self.data_joiner = NumpyConcatenateOuterBatch() if data_joiner is None else data_joiner
        self.max_queue_size = max_queue_size
        self.batch_size = batch_size
        self.n_workers_per_step = n_workers_per_step
        self.use_threading = use_threading
        self.use_savers = use_savers

        converted_steps = self._initialize_steps_as_tuple(steps)
        MiniBatchSequentialPipeline.__init__(
            self,
            steps=converted_steps,
            cache_folder=cache_folder
        )
        self._refresh_steps()
Beispiel #2
0
def main():
    """
    Compare the run time of three pipeline flavours built from the same steps.

    The task is to sleep 0.02 seconds for each data input and then multiply by 2.
    The same step list is run through a plain ``Pipeline``, a
    ``MiniBatchSequentialPipeline`` and a ``SequentialQueuedPipeline``. The function
    asserts that the queued pipeline is faster than the minibatched one and that
    all three produce equal outputs.
    """
    sleep_time = 0.02
    preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]

    # Classical pipeline - all at once with one big batch:
    p = Pipeline(preprocessing_and_model_steps)
    time_vanilla_pipeline, output_classical = eval_run_time(p)
    print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.")

    # Classical minibatch pipeline - minibatch size 10:
    p = MiniBatchSequentialPipeline(preprocessing_and_model_steps,
                                    batch_size=10)
    time_minibatch_pipeline, output_minibatch = eval_run_time(p)
    print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.")

    # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that
    # have a max queue size of 10 batches between preprocessing and the model:
    p = SequentialQueuedPipeline(preprocessing_and_model_steps,
                                 n_workers_per_step=16, max_queue_size=10, batch_size=10)
    time_parallel_pipeline, output_parallel = eval_run_time(p)
    print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.")

    # The failure message reports the two values that are actually being compared.
    assert time_parallel_pipeline < time_minibatch_pipeline, str((time_parallel_pipeline, time_minibatch_pipeline))
    assert np.array_equal(output_classical, output_minibatch)
    assert np.array_equal(output_classical, output_parallel)
Beispiel #3
0
    def __init__(
        self,
        steps: List[QueuedPipelineStepsTuple],
        batch_size: int,
        n_workers_per_step: int = None,
        max_queue_size: int = None,
        data_joiner=None,
        use_threading: bool = False,
        use_savers: bool = False,
        include_incomplete_batch: bool = False,
        default_value_data_inputs: Union[Any, AbsentValuesNullObject] = None,
        default_value_expected_outputs: Union[Any,
                                              AbsentValuesNullObject] = None,
        cache_folder: str = None,
    ):
        """
        Store the queue and batching settings, then initialize the mini-batch pipeline parent.

        :param steps: step definitions; converted with ``_initialize_steps_as_tuple`` before reaching the parent
        :param batch_size: kept on the instance and forwarded to the parent constructor
        :param n_workers_per_step: kept on the instance as ``n_workers_per_step``
        :param max_queue_size: kept on the instance as ``max_queue_size``
        :param data_joiner: joiner kept on the instance; a new ``NumpyConcatenateOuterBatch`` is used when ``None``
        :param use_threading: kept on the instance as ``use_threading``
        :param use_savers: kept on the instance as ``use_savers``
        :param include_incomplete_batch: kept on the instance and forwarded to the parent constructor
        :param default_value_data_inputs: kept on the instance and forwarded to the parent constructor
        :param default_value_expected_outputs: kept on the instance and forwarded to the parent constructor
        :param cache_folder: forwarded to the parent constructor
        """
        if data_joiner is None:
            data_joiner = NumpyConcatenateOuterBatch()
        self.data_joiner = data_joiner
        self.max_queue_size = max_queue_size
        self.n_workers_per_step = n_workers_per_step
        self.use_threading = use_threading
        self.use_savers = use_savers

        # Batching settings (batch_size is assigned once, here).
        self.batch_size: int = batch_size
        self.include_incomplete_batch: bool = include_incomplete_batch
        self.default_value_data_inputs: Union[
            Any, AbsentValuesNullObject] = default_value_data_inputs
        self.default_value_expected_outputs: Union[
            Any, AbsentValuesNullObject] = default_value_expected_outputs

        MiniBatchSequentialPipeline.__init__(
            self,
            steps=self._initialize_steps_as_tuple(steps),
            cache_folder=cache_folder,
            batch_size=batch_size,
            include_incomplete_batch=include_incomplete_batch,
            default_value_data_inputs=default_value_data_inputs,
            default_value_expected_outputs=default_value_expected_outputs)
        self._refresh_steps()
def test_mini_batch_sequential_pipeline_should_transform_steps_sequentially_for_each_barrier_for_each_batch():
    """
    Transform 20 inputs through four doubling steps split by two joiners of batch size 10.

    Each tape is expected to have recorded two batches of ten values, scaled by
    the number of doubling steps that ran before it.
    """
    # Given
    first_tape = TapeCallbackFunction()
    second_tape = TapeCallbackFunction()
    third_tape = TapeCallbackFunction()
    fourth_tape = TapeCallbackFunction()
    pipeline = MiniBatchSequentialPipeline([
        MultiplyBy2TransformCallbackStep(first_tape, ["1"]),
        MultiplyBy2TransformCallbackStep(second_tape, ["2"]),
        Joiner(batch_size=10),
        MultiplyBy2TransformCallbackStep(third_tape, ["3"]),
        MultiplyBy2TransformCallbackStep(fourth_tape, ["4"]),
        Joiner(batch_size=10)
    ])

    # When
    transformed = pipeline.transform(list(range(20)))

    # Then
    # Four doublings in total: every input ends up multiplied by 16.
    assert transformed == [16 * value for value in range(20)]

    # (tape, factor already applied to the data the tape received, step name)
    expectations = [
        (first_tape, 1, "1"),
        (second_tape, 2, "2"),
        (third_tape, 4, "3"),
        (fourth_tape, 8, "4"),
    ]
    for tape, factor, step_name in expectations:
        assert tape.data == [
            [factor * value for value in range(0, 10)],
            [factor * value for value in range(10, 20)],
        ]
        assert tape.name_tape == [step_name, step_name]
Beispiel #5
0
    def _create_mini_batch_pipeline(self, wrapped: BaseStep) -> BaseStep:
        """
        Add mini batching and batch metrics to a step when a batch size is configured.

        When ``self.batch_size`` is ``None`` the step is returned unchanged. Otherwise it is
        wrapped in a :class:`MetricsWrapper`, which is itself placed as the single step of a
        :class:`MiniBatchSequentialPipeline` using ``self.batch_size``.

        :param wrapped: pipeline step
        :type wrapped: BaseStep
        :return: the given step, or the mini-batch pipeline wrapping it
        :rtype: BaseStep
        """
        if self.batch_size is None:
            return wrapped

        batch_metrics_step = MetricsWrapper(
            wrapped=wrapped,
            metrics=self.batch_metrics,
            name=BATCH_METRICS_STEP_NAME,
            print_metrics=self.print_batch_metrics
        )
        return MiniBatchSequentialPipeline([batch_metrics_step], batch_size=self.batch_size)
def test_minibatch_sequential_pipeline_change_batch_size_works():
    """
    Fit-transform with batch size 10, switch to batch size 5, then fit-transform again.

    The tapes are expected to have recorded two batches of ten values followed by
    two batches of five values, for both the fit and the transform callbacks.
    """
    # Given
    first_transform_tape = TapeCallbackFunction()
    first_fit_tape = TapeCallbackFunction()
    second_transform_tape = TapeCallbackFunction()
    second_fit_tape = TapeCallbackFunction()

    pipeline = MiniBatchSequentialPipeline([
        MultiplyBy2FitTransformCallbackStep(first_transform_tape, first_fit_tape, ["1"]),
        Joiner(batch_size=10),
        MultiplyBy2FitTransformCallbackStep(second_transform_tape, second_fit_tape, ["2"]),
        Joiner(batch_size=10)
    ])

    # When
    pipeline, _ = pipeline.fit_transform(list(range(20)), list(range(20)))
    pipeline.set_batch_size(5)
    pipeline, _ = pipeline.fit_transform(list(range(20, 30)), list(range(20, 30)))

    # Then
    # Batches as seen by the first step: 2 x 10 items, then 2 x 5 items.
    raw_batches = [
        list(range(0, 10)),
        list(range(10, 20)),
        list(range(20, 25)),
        list(range(25, 30)),
    ]
    # The second step receives the first step's doubled outputs.
    doubled_batches = [[2 * value for value in batch] for batch in raw_batches]

    assert first_transform_tape.data == raw_batches
    assert first_fit_tape.data == [(batch, batch) for batch in raw_batches]
    assert first_transform_tape.name_tape == ["1"] * 4

    assert second_transform_tape.data == doubled_batches
    # Expected outputs are not transformed, so they still match the raw batches.
    assert second_fit_tape.data == list(zip(doubled_batches, raw_batches))
    assert second_transform_tape.name_tape == ["2"] * 4
Beispiel #7
0
def main(chosen_device):
    """
    Train the signal prediction pipeline on exercise 1, plot its metrics, then plot validation predictions.

    :param chosen_device: passed as ``device_name`` to the ``Tensorflow2ModelStep``
    """
    exercice_number = 1
    print('exercice {}\n=================='.format(exercice_number))

    # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
    data_inputs, expected_outputs = generate_data(
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    inputs_shape = data_inputs.shape
    outputs_shape = expected_outputs.shape
    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.
          format(inputs_shape))
    print(
        'expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'
        .format(outputs_shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    # Training settings.
    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim': 100,
        'layers_stacked_count': 2,
        'lambda_loss_amount': 0.0003,
        'learning_rate': 0.006,
        'window_size_future': sequence_length,
        'output_dim': output_dim,
        'input_dim': input_dim
    })

    # Build the steps one by one, in the order they run inside the mini-batch pipeline.
    normalizer_step = ForEachDataInput(MeanStdNormalizer())
    to_numpy_step = ToNumpy()
    # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
    model_step = Tensorflow2ModelStep(
        create_model=create_model,
        create_loss=create_loss,
        create_optimizer=create_optimizer,
        expected_outputs_dtype=tf.dtypes.float32,
        data_inputs_dtype=tf.dtypes.float32,
        device_name=chosen_device,
        print_loss=True).set_hyperparams(seq2seq_pipeline_hyperparams)
    plotting_model_step = PlotPredictionsWrapper(model_step)

    minibatch_pipeline = MiniBatchSequentialPipeline(
        [normalizer_step, to_numpy_step, plotting_model_step],
        batch_size=batch_size)
    pipeline = Pipeline([minibatch_pipeline]).set_name('SignalPrediction')

    validation_splitter = ValidationSplitter(test_size=validation_size)
    scoring_callback = ScoringCallback(
        metric_function=metric_3d_to_2d_wrapper(mean_squared_error),
        higher_score_is_better=False)
    trainer = Trainer(
        epochs=epochs,
        validation_splitter=validation_splitter,
        scoring_callback=scoring_callback)

    trial: Trial = trainer.train(pipeline=pipeline,
                                 data_inputs=data_inputs,
                                 expected_outputs=expected_outputs)

    # Metrics recorded under the 'main' key of the first validation split.
    main_metrics = trial.validation_splits[0].metrics_results['main']
    plot_metrics(
        metric_name='mse',
        train_values=main_metrics['train_values'],
        validation_values=main_metrics['validation_values'],
        exercice_number=exercice_number)

    # Get trained pipeline
    pipeline = trial.get_trained_pipeline(split_number=0)

    # Get validation set with trainer.validation_split_function.split function.
    split_result = trainer.validation_split_function.split(
        data_inputs=data_inputs, expected_outputs=expected_outputs)
    _, _, data_inputs_validation, expected_outputs_validation = split_result

    # Enable the plotting feature inside the PlotPredictionsWrapper wrapper step.
    pipeline.apply('toggle_plotting')
    pipeline.apply(method='set_max_plotted_predictions',
                   max_plotted_predictions=max_plotted_validation_predictions)

    # Transform the trained pipeline to plot predictions
    validation_container = DataContainer(
        data_inputs=data_inputs_validation[0],
        expected_outputs=expected_outputs_validation[0])
    pipeline.transform_data_container(validation_container)
Beispiel #8
0
def main():
    """
    Run an AutoML random search over a TensorFlow classifier on the white wine quality CSV.

    The CSV is read from ``data/winequality-white.csv``. Its last column is shifted
    down by one and used as the expected output (column 11); columns 0 to 10 are
    the model inputs. The default hyperparameters and the hyperparameter space
    both use the same ``n_classes`` value as the final ``OneHotEncoder``.
    """
    def accuracy(data_inputs, expected_outputs):
        """Return the fraction of rows whose argmax matches between predictions and expected outputs."""
        return np.mean(
            np.argmax(np.array(data_inputs), axis=1) == np.argmax(
                np.array(expected_outputs), axis=1))

    # load the dataset
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    # Shift the label column (last column) down by one.
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    # Every column except the last one is a feature.
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ).set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                # Same class count as the hyperparameter space and the
                # OneHotEncoder below, so the defaults agree with both.
                'n_classes': n_classes
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(
            cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy,
                                         higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(name='f1',
                           metric_function=f1_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='recall',
                           metric_function=recall_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='precision',
                           metric_function=precision_score_weighted,
                           higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    # Only data_inputs is passed: the expected outputs are taken from column 11
    # by the ColumnTransformerInputOutput step above.
    auto_ml = auto_ml.fit(data_inputs=data_inputs)
Beispiel #9
0
def main(chosen_device):
    """
    Fit the signal prediction pipeline on exercise 1, then plot its metrics and predictions.

    The model pipeline is wrapped for batch metrics, mini-batching, epoch metrics,
    validation splitting and epoch repetition before being fitted.

    :param chosen_device: passed as ``device_name`` to the ``Tensorflow2ModelStep``
    """
    exercice_number = 1
    print('exercice {}\n=================='.format(exercice_number))

    data_inputs, expected_outputs = generate_data(
        # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.
          format(data_inputs.shape))
    print(
        'expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'
        .format(expected_outputs.shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim':
        100,
        'layers_stacked_count':
        2,
        'lambda_loss_amount':
        0.0003,
        'learning_rate':
        0.006,
        'window_size_future':
        sequence_length,
        'output_dim':
        output_dim,
        'input_dim':
        input_dim
    })
    # The same metric is used for batch metrics, epoch metrics and validation scoring.
    feature_0_metric = metric_3d_to_2d_wrapper(mean_squared_error)
    metrics = {'mse': feature_0_metric}

    signal_prediction_pipeline = Pipeline([
        ForEachDataInput(MeanStdNormalizer()),
        ToNumpy(),
        PlotPredictionsWrapper(
            Tensorflow2ModelStep(
                # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer,
                expected_outputs_dtype=tf.dtypes.float32,
                data_inputs_dtype=tf.dtypes.float32,
                # Honour the device requested by the caller instead of ignoring it.
                device_name=chosen_device,
                print_loss=True).set_hyperparams(seq2seq_pipeline_hyperparams))
    ]).set_name('SignalPrediction')

    pipeline = Pipeline([
        EpochRepeater(ValidationSplitWrapper(
            MetricsWrapper(Pipeline([
                TrainOnlyWrapper(DataShuffler()),
                MiniBatchSequentialPipeline([
                    MetricsWrapper(signal_prediction_pipeline,
                                   metrics=metrics,
                                   name='batch_metrics')
                ],
                                            batch_size=batch_size)
            ]),
                           metrics=metrics,
                           name='epoch_metrics',
                           print_metrics=True),
            test_size=validation_size,
            scoring_function=feature_0_metric),
                      epochs=epochs)
    ])

    pipeline, outputs = pipeline.fit_transform(data_inputs, expected_outputs)

    plot_metrics(pipeline=pipeline, exercice_number=exercice_number)
    plot_predictions(data_inputs, expected_outputs, pipeline,
                     max_plotted_validation_predictions)