Example #1
    def split_data_container(self, data_container) -> Tuple[DataContainer, DataContainer]:
        """
        Split the data container into a training set and a validation set.

        :param data_container: data container to split
        :type data_container: DataContainer
        :return: train_data_container, validation_data_container
        """

        train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = \
            self.split(data_container.data_inputs, data_container.expected_outputs)

        train_ids = self.train_split(data_container.current_ids)
        train_data_container = DataContainer(data_inputs=train_data_inputs, current_ids=train_ids,
                                             summary_id=data_container.summary_id,
                                             expected_outputs=train_expected_outputs)

        validation_ids = self.validation_split(data_container.current_ids)
        validation_data_container = DataContainer(data_inputs=validation_data_inputs, current_ids=validation_ids,
                                                  summary_id=data_container.summary_id,
                                                  expected_outputs=validation_expected_outputs)

        return train_data_container, validation_data_container
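A minimal sketch of what the two returned containers look like, built with DataContainer directly; the 80/20 split and the summary id value are illustrative, not taken from the example above:

import numpy as np
from neuraxle.data_container import DataContainer

full = DataContainer(data_inputs=np.arange(10), expected_outputs=np.arange(10),
                     current_ids=[str(i) for i in range(10)],
                     summary_id='abc123')  # 'abc123' is a made-up summary id

# Both splits keep the parent's summary_id but carry their own current_ids.
train_data_container = DataContainer(data_inputs=full.data_inputs[:8],
                                     expected_outputs=full.expected_outputs[:8],
                                     current_ids=full.current_ids[:8],
                                     summary_id=full.summary_id)
validation_data_container = DataContainer(data_inputs=full.data_inputs[8:],
                                          expected_outputs=full.expected_outputs[8:],
                                          current_ids=full.current_ids[8:],
                                          summary_id=full.summary_id)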
Example #2
def test_fit_transform_input_and_output_transformer_wrapper_with_hashlib_md5_value_hasher():
    step = InputAndOutputTransformerWrapper(WindowTimeSeriesForOutputTransformerWrapper()) \
        .set_hashers([HashlibMd5ValueHasher()])

    step, data_container = step.handle_fit_transform(
        data_container=DataContainer(
            data_inputs=np.array(list(range(10))),
            expected_outputs=np.array(list(range(10)))
        ),
        context=ExecutionContext()
    )

    assert np.array_equal(data_container.data_inputs, np.array(list(range(0, 5))))
    assert np.array_equal(data_container.expected_outputs, np.array(list(range(5, 10))))
Example #3
    def inverse_transform(self, processed_outputs) -> Any:
        """
        After transforming all data inputs and obtaining a prediction, we can inverse transform the processed outputs.

        :param processed_outputs: the forward-transformed data inputs
        :return: backward transformed processed outputs
        """
        data_container = DataContainer(data_inputs=processed_outputs)
        context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.INVERSE_TRANSFORM)

        for step_name, step in list(reversed(self.items())):
            data_container = step.handle_inverse_transform(data_container, context)

        return data_container.data_inputs
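A hedged usage sketch of the round trip this method enables. It assumes MultiplyByN (seen in later examples) also implements inverse_transform as a division, and that the import paths below match this Neuraxle version:

import numpy as np
from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import MultiplyByN

p = Pipeline([MultiplyByN(2)])
processed_outputs = p.transform(np.array([1, 2, 3]))   # [2, 4, 6]
original = p.inverse_transform(processed_outputs)       # walks the steps in reverse
assert np.array_equal(np.array(original), np.array([1, 2, 3]))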
Example #4
    def handle_inverse_transform(self, data_container: DataContainer,
                                 context: ExecutionContext) -> DataContainer:
        """
        Handle inverse transform by passing expected outputs to the wrapped step inverse transform method.
        Update the expected outputs with the outputs.

        :param context: execution context
        :param data_container: data container to inverse transform
        :return: data container
        :rtype: DataContainer
        """
        new_expected_outputs_data_container = self.wrapped.handle_inverse_transform(
            DataContainer(current_ids=data_container.current_ids,
                          data_inputs=self._get_data_inputs(data_container),
                          expected_outputs=None), context.push(self.wrapped))

        data_container.set_expected_outputs(
            new_expected_outputs_data_container.data_inputs)

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return data_container
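The DataContainer mutators used above, sketched in isolation (values are illustrative):

import numpy as np
from neuraxle.data_container import DataContainer

data_container = DataContainer(data_inputs=np.arange(3), expected_outputs=None)
data_container.set_expected_outputs(np.arange(3) * 2)   # write the wrapped step's outputs back as expected outputs
data_container.set_current_ids(['0', '1', '2'])         # normally the result of self.hash(data_container)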
Example #5
    def fit_transform(self,
                      data_inputs,
                      expected_outputs=None) -> ('Pipeline', Any):
        """
        After loading the last checkpoint, fit transform each pipeline step.

        :param data_inputs: the data input to fit on
        :param expected_outputs: the expected data output to fit on
        :return: the fitted pipeline, and the transformed data inputs
        """
        data_container = DataContainer(current_ids=None,
                                       data_inputs=data_inputs,
                                       expected_outputs=expected_outputs)
        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        context = ExecutionContext.create_from_root(
            self, ExecutionMode.FIT_TRANSFORM, self.cache_folder)

        new_self, data_container = self._fit_transform_core(
            data_container, context)

        return new_self, data_container.data_inputs
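A hedged sketch of calling this entry point: it returns the fitted pipeline and the transformed data inputs as a tuple. A plain Pipeline is used here since the call signature is the same; MultiplyByN and the import paths are assumptions based on the other examples on this page:

import numpy as np
from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import MultiplyByN

p = Pipeline([MultiplyByN(2), MultiplyByN(3)])
p, outputs = p.fit_transform(np.array([1, 2, 3]), np.array([6, 12, 18]))
assert np.array_equal(outputs, np.array([6, 12, 18]))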
Example #6
def test_inner_concatenate_data_should_merge_1d_with_2d():
    # Given
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_inputs_1d, expected_outputs_1d = _create_data_source(SHAPE_1D)
    data_container_1d = DataContainer(data_inputs=data_inputs_1d,
                                      expected_outputs=expected_outputs_1d)
    data_container = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d) \
        .add_sub_data_container('1d', data_container_1d)

    # When
    p = Pipeline(
        [InnerConcatenateDataContainer(sub_data_container_names=['1d'])])

    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    assert data_container.data_inputs.shape == (SHAPE_2D[0], SHAPE_2D[1] + 1)
    assert data_container.expected_outputs.shape == (SHAPE_2D[0],
                                                     SHAPE_2D[1] + 1)
    assert np.array_equal(data_container.data_inputs[..., -1],
                          data_container_1d.data_inputs)
    assert np.array_equal(data_container.expected_outputs[..., -1],
                          data_container_1d.expected_outputs)
Example #7
    def transform(self, data_inputs: Any):
        """
        :param data_inputs: the data input to transform
        :return: transformed data inputs
        """
        data_container = DataContainer(data_inputs=data_inputs,
                                       current_ids=None)

        data_container = self.hash_data_container(data_container)

        context = ExecutionContext(self.cache_folder, ExecutionMode.TRANSFORM)
        data_container = self.handle_transform(data_container, context)

        return data_container.data_inputs
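The handler-level equivalent of the call above, building the DataContainer and ExecutionContext by hand as the tests on this page do; import paths are assumed:

import numpy as np
from neuraxle.base import ExecutionContext
from neuraxle.data_container import DataContainer
from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import MultiplyByN

p = Pipeline([MultiplyByN(2)])
data_container = p.handle_transform(
    DataContainer(data_inputs=np.array([1, 2, 3])),
    ExecutionContext()
)
assert np.array_equal(data_container.data_inputs, np.array([2, 4, 6]))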
Example #8
    def fit_data_container(self, data_container: DataContainer,
                           context: ExecutionContext) -> BaseStep:
        """
        Fit all sub pipelines split by the Barrier steps.

        :param data_container: data container to fit on
        :param context: execution context
        :return: fitted self
        """
        data_container, context = self._will_process(data_container, context)
        data_container, context = self._will_transform_data_container(
            data_container, context)

        sub_pipelines = self._create_sub_pipelines()
        index_start = 0

        for sub_pipeline in sub_pipelines:
            sub_pipeline.setup()

            barrier = sub_pipeline[-1]
            sub_pipeline, data_container = barrier.join_fit_transform(
                step=sub_pipeline,
                data_container=data_container,
                context=context)
            current_ids = self.hash(data_container)
            data_container.set_current_ids(current_ids)

            new_self = self[:index_start] + sub_pipeline
            if index_start + len(sub_pipeline) < len(self):
                new_self += self[index_start + len(sub_pipeline):]

            self.steps_as_tuple = new_self.steps_as_tuple
            index_start += len(sub_pipeline)

        data_container = self._did_fit(data_container, context)
        self._did_process(data_container, context)

        return self
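The re-assembly of self around each fitted sub-pipeline can be pictured with plain lists; TruncableSteps slicing behaves analogously on steps_as_tuple (this is only an illustration of the indexing, not library code):

steps = ['a', 'b', 'barrier1', 'c', 'barrier2']
sub_pipeline = ['a', 'b', 'barrier1']   # first sub-pipeline, already fitted
index_start = 0

new_steps = steps[:index_start] + sub_pipeline
if index_start + len(sub_pipeline) < len(steps):
    new_steps += steps[index_start + len(sub_pipeline):]

assert new_steps == ['a', 'b', 'barrier1', 'c', 'barrier2']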
Example #9
    def split_data_container(self, data_container: DataContainer, context: ExecutionContext) -> List[
        Tuple[DataContainer, DataContainer]]:
        """
        Wrap a validation split function with a split data container function.
        A validation split function takes two arguments: data inputs, and expected outputs.

        :param data_container: data container to split
        :param context: execution context
        :return: a list of (train, validation) data container pairs, one pair per validation split.
        """
        train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = self.split(
            data_inputs=data_container.data_inputs,
            expected_outputs=data_container.expected_outputs,
            context=context
        )

        train_data_container = DataContainer(data_inputs=train_data_inputs, expected_outputs=train_expected_outputs)
        validation_data_container = DataContainer(data_inputs=validation_data_inputs,
                                                  expected_outputs=validation_expected_outputs)

        splits = []
        for (train_current_id, train_di, train_eo), (validation_current_id, validation_di, validation_eo) in zip(
                train_data_container, validation_data_container):
            train_data_container_split = DataContainer(
                summary_id=train_current_id,
                data_inputs=train_di,
                expected_outputs=train_eo
            )

            validation_data_container_split = DataContainer(
                summary_id=validation_current_id,
                data_inputs=validation_di,
                expected_outputs=validation_eo
            )

            splits.append((train_data_container_split, validation_data_container_split))

        return splits
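The zip above relies on a DataContainer being iterable as (current_id, data_input, expected_output) triples; a small sketch of that protocol, with explicit, illustrative current ids:

import numpy as np
from neuraxle.data_container import DataContainer

dc = DataContainer(data_inputs=np.arange(3), expected_outputs=np.arange(3) * 10,
                   current_ids=['0', '1', '2'])
for current_id, data_input, expected_output in dc:
    print(current_id, data_input, expected_output)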
Example #10
    def fit_trial_split(self, trial_split: TrialSplit,
                        train_data_container: DataContainer,
                        validation_data_container: DataContainer,
                        context: ExecutionContext) -> TrialSplit:
        """
        Train pipeline using the training data container.
        Track training, and validation metrics for each epoch.

        :param train_data_container: train data container
        :param validation_data_container: validation data container
        :param trial_split: trial to execute
        :param context: execution context

        :return: executed trial
        """
        early_stopping = False

        for i in range(self.epochs):
            self.print_func('\nepoch {}/{}'.format(i + 1, self.epochs))
            trial_split = trial_split.fit_trial_split(
                train_data_container.copy(), context)
            y_pred_train = trial_split.predict_with_pipeline(
                train_data_container.copy(), context)
            y_pred_val = trial_split.predict_with_pipeline(
                validation_data_container.copy(), context)

            if self.callbacks.call(trial=trial_split,
                                   epoch_number=i,
                                   total_epochs=self.epochs,
                                   input_train=train_data_container,
                                   pred_train=y_pred_train,
                                   input_val=validation_data_container,
                                   pred_val=y_pred_val,
                                   is_finished_and_fitted=early_stopping):
                break

        return trial_split
Example #11
    def _fit_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> 'BaseStep':
        """
        Handle fit by passing the data inputs, and the expected outputs to the wrapped step fit method.

        :param context: execution context
        :type context: ExecutionContext
        :param data_container: data container to fit on
        :return: fitted self
        :rtype: BaseStep
        """
        self.wrapped = self.wrapped.handle_fit(
            DataContainer(data_inputs=(copy.copy(data_container.data_inputs),
                                       copy.copy(
                                           data_container.expected_outputs)),
                          current_ids=data_container.current_ids,
                          expected_outputs=None), context)

        data_container.set_data_inputs(
            (data_container.data_inputs, data_container.expected_outputs))
        data_container.set_expected_outputs(expected_outputs=None)

        return self
Example #12
    def handle_fit_transform(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('BaseStep', DataContainer):
        """
        Fit transform data container.

        :param context: execution context
        :param data_container: the data container to transform
        :type data_container: neuraxle.data_container.DataContainer

        :return: tuple(fitted self, data_container)
        """
        self.create_checkpoint_path(context.get_path())
        self.flush_cache()
        self.wrapped = self.wrapped.fit(data_container.data_inputs,
                                        data_container.expected_outputs)
        outputs = self._transform_with_cache(data_container)

        data_container.set_data_inputs(outputs)

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return self, data_container
Example #13
def test_outer_concatenate_data_should_merge_2d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_container_2d = DataContainer(data_inputs=data_inputs_2d,
                                      expected_outputs=expected_outputs_2d)
    data_container = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d) \
        .add_sub_data_container('2d', data_container_2d)

    # When
    p = Pipeline([ZipBatchDataContainer(sub_data_container_names=['2d'])])

    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    for i, (first_di,
            second_di) in enumerate(zip(data_inputs_3d, data_inputs_2d)):
        assert np.array_equal(data_container.data_inputs[i][0], first_di)
        assert np.array_equal(data_container.data_inputs[i][1], second_di)

    for i, (first_eo, second_eo) in enumerate(
            zip(expected_outputs_3d, expected_outputs_2d)):
        assert np.array_equal(data_container.expected_outputs[i][0], first_eo)
        assert np.array_equal(data_container.expected_outputs[i][1], second_eo)
Example #14
def test_output_transformer_wrapper_should_fit_transform_with_data_inputs_and_expected_outputs():
    tape = TapeCallbackFunction()
    p = OutputTransformerWrapper(Pipeline([MultiplyByN(2), FitCallbackStep(tape)]))
    data_inputs, expected_outputs = _create_data_source((10, 10))

    p, data_container = p.handle_fit_transform(DataContainer(
        data_inputs=data_inputs,
        expected_outputs=expected_outputs
    ), ExecutionContext())

    assert np.array_equal(data_container.data_inputs, data_inputs)
    assert np.array_equal(data_container.expected_outputs, expected_outputs * 2)
    assert np.array_equal(tape.data[0][0], expected_outputs * 2)
    for i in range(10):
        assert tape.data[0][1][i] is None
Example #15
    def fit_transform(self,
                      data_inputs,
                      expected_outputs=None) -> ('Pipeline', Any):
        """
        After loading the last checkpoint, fit transform each pipeline step.

        :param data_inputs: the data input to fit on
        :param expected_outputs: the expected data output to fit on
        :return: the fitted pipeline, and the transformed data inputs
        """
        new_self, data_container = self.fit_transform_data_container(
            DataContainer(data_inputs=data_inputs,
                          current_ids=None,
                          expected_outputs=expected_outputs))
        return new_self, data_container.data_inputs
Example #16
    def transform(self, data_inputs: Any):
        """
        After loading the last checkpoint, transform each pipeline step.

        :param data_inputs: the data input to transform
        :return: transformed data inputs
        """
        data_container = DataContainer(current_ids=None, data_inputs=data_inputs)

        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.TRANSFORM)
        context = context.push(self)
        data_container = self._transform_data_container(data_container, context)

        return data_container.data_inputs
Example #17
def test_flatten_for_each_should_transform_data_inputs_and_expected_outputs():
    p = FlattenForEach(Pipeline([
        MultiplyByN(2),
        OutputTransformerWrapper(MultiplyByN(3))
    ]))
    # TODO: should use a tape here and ensure that the MultiplyByN received a flat 12 shape only once and not 3*4 things
    data_inputs, expected_outputs = _create_random_of_shape(DATA_SHAPE)

    p, outputs = p.handle_fit_transform(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs), ExecutionContext())

    assert np.array(outputs.data_inputs).shape == DATA_SHAPE
    assert np.array_equal(outputs.data_inputs, data_inputs * 2)
    assert np.array(outputs.expected_outputs).shape == DATA_SHAPE
    assert np.array_equal(outputs.expected_outputs, expected_outputs * 3)
Example #18
    def join_transform(self, step: TruncableSteps, data_container: DataContainer,
                       context: ExecutionContext) -> ZipDataContainer:
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            keep_incomplete_batch=self.keep_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs
        )

        output_data_container = []
        for data_container_batch in data_container_batches:
            output_data_container.append(step._transform_data_container(data_container_batch, context))

        return ZipDataContainer.create_from(*output_data_container)
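A hedged sketch of the minibatching call in isolation, using only the batch_size argument; the keyword names come from this example, and defaults for the remaining arguments are assumed:

import numpy as np
from neuraxle.data_container import DataContainer

dc = DataContainer(data_inputs=np.arange(10), expected_outputs=np.arange(10))
for batch in dc.minibatches(batch_size=5):
    print(len(batch.data_inputs))   # two batches of 5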
Example #19
def test_handle_predict_should_predict_in_test_mode():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(
            CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(
            CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    data_container = p.handle_predict(data_container=DataContainer(
        data_inputs=np.array([1, 1]), expected_outputs=np.array([1, 1])),
                                      context=ExecutionContext())

    assert np.array_equal(data_container.data_inputs, np.array([2, 2]))
Example #20
    def _fit_data_container(self, data_container: DataContainer, context: ExecutionContext) -> 'BaseStep':
        """
        Fit wrapped step self.epochs times using wrapped step handle fit method.

        :param data_container: data container
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: fitted self
        :rtype: BaseStep
        """
        epochs = self._get_epochs()

        for _ in range(epochs):
            self.wrapped = self.wrapped.handle_fit(data_container.copy(), context)
        return self
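The same epoch loop, sketched with a concrete step so it can run standalone; MultiplyByN needs no fitting and simply stands in for any wrapped step with handle_fit (the epoch count of 3 is illustrative):

import numpy as np
from neuraxle.base import ExecutionContext
from neuraxle.data_container import DataContainer
from neuraxle.steps.numpy import MultiplyByN

step = MultiplyByN(2)
data_container = DataContainer(data_inputs=np.arange(5), expected_outputs=np.arange(5))
for _ in range(3):
    step = step.handle_fit(data_container.copy(), ExecutionContext())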
Example #21
    def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
            Tuple['Any', DataContainer]:
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            keep_incomplete_batch=self.keep_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs
        )

        output_data_container = []
        for data_container_batch in data_container_batches:
            step, data_container_batch = step._fit_transform_data_container(data_container_batch, context)
            output_data_container.append(data_container_batch)

        return step, ZipDataContainer.create_from(*output_data_container)
Example #22
    def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
        """
        :param data_inputs: the data input to fit on
        :param expected_outputs: the expected data output to fit on
        :return: the fitted pipeline, and the transformed data inputs
        """
        self.setup()

        data_container = DataContainer(current_ids=None, data_inputs=data_inputs, expected_outputs=expected_outputs)

        data_container = self.hash_data_container(data_container)

        context = ExecutionContext(self.cache_folder, ExecutionMode.FIT_TRANSFORM)
        new_self, data_container = self.handle_fit_transform(data_container, context)

        return new_self, data_container.data_inputs
Example #23
def test_joblib_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    create_pipeline_output_transformer(tmpdir).fit_transform(
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs)

    actual_data_container = create_pipeline_output_transformer(
        tmpdir).handle_transform(
            DataContainer(data_inputs=initial_data_inputs,
                          current_ids=[0, 1],
                          expected_outputs=initial_expected_outputs),
            ExecutionContext(tmpdir))

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
Example #24
def test_queued_pipeline_with_step_with_threading():
    p = SequentialQueuedPipeline(
        [MultiplyByN(2),
         MultiplyByN(2),
         MultiplyByN(2),
         MultiplyByN(2)],
        batch_size=10,
        n_workers_per_step=1,
        max_queue_size=5,
        use_processes=False)

    data_container = DataContainer(data_inputs=list(range(100)))
    context = ExecutionContext()

    outputs = p.handle_transform(data_container, context)

    assert np.array_equal(outputs.data_inputs, EXPECTED_OUTPUTS)
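EXPECTED_OUTPUTS is a module-level constant not shown here; four chained MultiplyByN(2) steps multiply each input by 16, so it is presumably equivalent to:

EXPECTED_OUTPUTS = [i * 16 for i in range(100)]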
Example #25
    def test_logger_automl(self, tmpdir):
        # Given
        context = ExecutionContext()
        self.tmpdir = str(tmpdir)
        hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
        n_epochs = 2
        n_trials = 4
        auto_ml = AutoML(
            pipeline=Pipeline([
                MultiplyByN(2).set_hyperparams_space(
                    HyperparameterSpace(
                        {'multiply_by': FixedHyperparameter(2)})),
                NumpyReshape(new_shape=(-1, 1)),
                LoggingStep()
            ]),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(
            ),
            validation_splitter=ValidationSplitter(0.20),
            scoring_callback=ScoringCallback(mean_squared_error,
                                             higher_score_is_better=False),
            n_trials=n_trials,
            refit_trial=True,
            epochs=n_epochs,
            hyperparams_repository=hp_repository,
            continue_loop_on_error=False)

        # When
        data_container = DataContainer(
            data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
            expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
        auto_ml.handle_fit(data_container, context)

        # Then
        file_paths = [
            os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
            for i in range(n_trials)
        ]
        assert len(file_paths) == n_trials

        for f in file_paths:
            assert os.path.exists(f)

        for f in file_paths:
            with open(f, 'r') as f:
                log = f.readlines()
                assert len(log) == 36
Example #26
    def _transform_data_container(self, data_container: DataContainer,
                                  context: ExecutionContext) -> DataContainer:
        """
        Handle transform by passing data_inputs, and expected outputs to the wrapped step transform method.
        Update the expected outputs with the outputs.

        :param context: execution context
        :param data_container: data container to transform
        :return: data container
        :rtype: DataContainer
        """
        output_data_container = self.wrapped.handle_transform(
            DataContainer(data_inputs=(data_container.data_inputs,
                                       data_container.expected_outputs),
                          current_ids=data_container.current_ids,
                          expected_outputs=None), context)

        return output_data_container
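What the wrapped step receives, sketched with a bare DataContainer: its data_inputs field holds the (data_inputs, expected_outputs) pair and its expected_outputs field is None (values are illustrative):

from neuraxle.data_container import DataContainer

outer = DataContainer(data_inputs=[1, 2, 3], expected_outputs=[2, 4, 6])
inner = DataContainer(data_inputs=(outer.data_inputs, outer.expected_outputs),
                      current_ids=outer.current_ids,
                      expected_outputs=None)
assert inner.data_inputs == ([1, 2, 3], [2, 4, 6])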
Example #27
    def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> (
            'BaseStep', DataContainer):
        """
        Fit transform wrapped step self.epochs times using wrapped step handle fit transform method.

        :param data_container: data container
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: (fitted self, data container)
        :rtype: (BaseStep, DataContainer)
        """
        if not self.fit_only:
            for _ in range(self.epochs - 1):
                self.wrapped = self.wrapped.handle_fit(data_container.copy(), context)

        self.wrapped, data_container = self.wrapped.handle_fit_transform(data_container, context)
        return self, data_container
Example #28
    def train(self,
              pipeline: BaseStep,
              data_inputs,
              expected_outputs=None,
              context: ExecutionContext = None,
              trial_number=0) -> Trial:
        """
        Train pipeline using the validation splitter.
        Track training, and validation metrics for each epoch.
        Note: the present method is just a shortcut to using the `execute_trial` method with less boilerplate code needed. Refer to `execute_trial` for full flexibility.

        :param pipeline: pipeline to train on
        :param data_inputs: data inputs
        :param expected_outputs: expected outputs to fit on
        :param context: execution context
        :param trial_number: trial number
        :return: executed trial

        """
        assert not (
            context is None
        )  # TODO: change order of arguments so that context isn't an optional argument

        validation_splits: List[Tuple[
            DataContainer,
            DataContainer]] = self.validation_split_function.split_data_container(
                DataContainer(data_inputs=data_inputs,
                              expected_outputs=expected_outputs),
                context=context)

        repo_trial: Trial = Trial(
            pipeline=pipeline,
            logger=context.logger,
            hyperparams=pipeline.get_hyperparams(),
            main_metric_name=self.get_main_metric_name(),
            save_trial_function=self.hyperparams_repository.save_trial,
            trial_number=trial_number)

        self.execute_trial(pipeline=pipeline,
                           repo_trial=repo_trial,
                           context=context,
                           validation_splits=validation_splits,
                           n_trial=1,
                           delete_pipeline_on_completion=False)

        return repo_trial
Example #29
    def _fit_transform_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> (BaseStep, DataContainer):
        """
        Handle fit transform by passing the data inputs, and the expected outputs to the wrapped step fit transform method.
        Update the expected outputs with the outputs.

        :param context: execution context
        :type context: ExecutionContext
        :param data_container: data container to fit on
        :return: self, data container
        :rtype: (BaseStep, DataContainer)
        """
        self.wrapped, output_data_container = self.wrapped.handle_fit_transform(
            DataContainer(data_inputs=(data_container.data_inputs,
                                       data_container.expected_outputs),
                          current_ids=data_container.current_ids,
                          expected_outputs=None), context)
        return self, output_data_container
Example #30
    def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
        """
        After loading the last checkpoint, fit transform each pipeline step.

        :param data_inputs: the data input to fit on
        :param expected_outputs: the expected data output to fit on
        :return: the fitted pipeline, and the transformed data inputs
        """
        data_container = DataContainer(
            current_ids=None,
            data_inputs=data_inputs,
            expected_outputs=expected_outputs
        )

        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.FIT_TRANSFORM)
        context = context.push(self)
        new_self, data_container = self._fit_transform_data_container(data_container, context)

        return new_self, data_container.data_inputs