def split_data_container(self, data_container) -> Tuple[DataContainer, DataContainer]:
    """
    Split data container into a training set, and a validation set.

    :param data_container: data container
    :type data_container: DataContainer
    :return: train_data_container, validation_data_container
    """
    train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = \
        self.split(data_container.data_inputs, data_container.expected_outputs)

    train_ids = self.train_split(data_container.current_ids)
    train_data_container = DataContainer(
        data_inputs=train_data_inputs,
        current_ids=train_ids,
        summary_id=data_container.summary_id,
        expected_outputs=train_expected_outputs
    )

    validation_ids = self.validation_split(data_container.current_ids)
    validation_data_container = DataContainer(
        data_inputs=validation_data_inputs,
        current_ids=validation_ids,
        summary_id=data_container.summary_id,
        expected_outputs=validation_expected_outputs
    )

    return train_data_container, validation_data_container
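# A minimal, self-contained sketch of the same train/validation split pattern,
# stripped of the DataContainer machinery. The `_sketch_split` helper and its
# `ratio` parameter are hypothetical names for illustration only, not part of
# the API above.
def _sketch_split(data_inputs, expected_outputs, ratio=0.8):
    cut = int(len(data_inputs) * ratio)
    return (data_inputs[:cut], expected_outputs[:cut],   # training set
            data_inputs[cut:], expected_outputs[cut:])   # validation set

train_di, train_eo, val_di, val_eo = _sketch_split(list(range(10)), list(range(10, 20)))
assert len(train_di) == 8 and len(val_di) == 2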
def test_fit_transform_input_and_output_transformer_wrapper_with_hashlib_md5_value_hasher():
    step = InputAndOutputTransformerWrapper(WindowTimeSeriesForOutputTransformerWrapper()) \
        .set_hashers([HashlibMd5ValueHasher()])

    step, data_container = step.handle_fit_transform(
        data_container=DataContainer(
            data_inputs=np.array(list(range(10))),
            expected_outputs=np.array(list(range(10)))
        ),
        context=ExecutionContext()
    )

    assert np.array_equal(data_container.data_inputs, np.array(list(range(0, 5))))
    assert np.array_equal(data_container.expected_outputs, np.array(list(range(5, 10))))
def inverse_transform(self, processed_outputs) -> Any:
    """
    After transforming all data inputs, and obtaining a prediction, we can inverse transform the processed outputs.

    :param processed_outputs: the forward transformed data input
    :return: backward transformed processed outputs
    """
    data_container = DataContainer(data_inputs=processed_outputs)
    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.INVERSE_TRANSFORM)

    for step_name, step in list(reversed(self.items())):
        data_container = step.handle_inverse_transform(data_container, context)

    return data_container.data_inputs
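# A plain-Python sketch of the loop above: inverse transforming is just folding
# the processed outputs backwards through the steps. `inverse_steps` stores the
# inverse callables in forward order; both names are hypothetical.
def _sketch_inverse_transform(inverse_steps, processed_outputs):
    data = processed_outputs
    for _, inverse_step in reversed(inverse_steps):  # undo steps last-to-first
        data = inverse_step(data)
    return data

# Suppose the forward pass computed (x * 2) + 1 on x = 10, giving 21:
inverse_steps = [('mul', lambda x: x // 2), ('add', lambda x: x - 1)]
assert _sketch_inverse_transform(inverse_steps, 21) == 10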
def handle_inverse_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Handle inverse transform by passing the expected outputs to the wrapped step's inverse transform method,
    then update the expected outputs with the outputs.

    :param data_container: data container to inverse transform
    :param context: execution context
    :return: data container
    :rtype: DataContainer
    """
    new_expected_outputs_data_container = self.wrapped.handle_inverse_transform(
        DataContainer(
            current_ids=data_container.current_ids,
            data_inputs=self._get_data_inputs(data_container),
            expected_outputs=None
        ),
        context.push(self.wrapped)
    )
    data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return data_container
def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
    """
    After loading the last checkpoint, fit transform each pipeline step.

    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the fitted pipeline, and the transformed data inputs
    """
    data_container = DataContainer(current_ids=None, data_inputs=data_inputs, expected_outputs=expected_outputs)
    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    context = ExecutionContext.create_from_root(self, ExecutionMode.FIT_TRANSFORM, self.cache_folder)
    new_self, data_container = self._fit_transform_core(data_container, context)

    return new_self, data_container.data_inputs
def test_inner_concatenate_data_should_merge_1d_with_2d():
    # Given
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_inputs_1d, expected_outputs_1d = _create_data_source(SHAPE_1D)
    data_container_1d = DataContainer(data_inputs=data_inputs_1d, expected_outputs=expected_outputs_1d)
    data_container = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d) \
        .add_sub_data_container('1d', data_container_1d)

    # When
    p = Pipeline([InnerConcatenateDataContainer(sub_data_container_names=['1d'])])
    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    assert data_container.data_inputs.shape == (SHAPE_2D[0], SHAPE_2D[1] + 1)
    assert data_container.expected_outputs.shape == (SHAPE_2D[0], SHAPE_2D[1] + 1)
    assert np.array_equal(data_container.data_inputs[..., -1], data_container_1d.data_inputs)
    assert np.array_equal(data_container.expected_outputs[..., -1], data_container_1d.expected_outputs)
def transform(self, data_inputs: Any):
    """
    :param data_inputs: the data input to transform
    :return: transformed data inputs
    """
    data_container = DataContainer(data_inputs=data_inputs, current_ids=None)
    data_container = self.hash_data_container(data_container)

    context = ExecutionContext(self.cache_folder, ExecutionMode.TRANSFORM)
    data_container = self.handle_transform(data_container, context)

    return data_container.data_inputs
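# A sketch of deriving per-sample "current ids" by hashing, in the spirit of
# hash_data_container above. Using hashlib.md5 matches the HashlibMd5ValueHasher
# named elsewhere in this section, but this helper itself is hypothetical.
import hashlib

def _sketch_hash_ids(data_inputs):
    return [hashlib.md5(str(di).encode('utf-8')).hexdigest() for di in data_inputs]

ids = _sketch_hash_ids([1, 2, 3])
assert len(ids) == 3 and all(len(i) == 32 for i in ids)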
def fit_data_container(self, data_container: DataContainer, context: ExecutionContext) -> BaseStep:
    """
    Fit all sub pipelines split by the Barrier steps.

    :param data_container: data container to fit on
    :param context: execution context
    :return: fitted self
    """
    data_container, context = self._will_process(data_container, context)
    data_container, context = self._will_transform_data_container(data_container, context)

    sub_pipelines = self._create_sub_pipelines()
    index_start = 0

    for sub_pipeline in sub_pipelines:
        sub_pipeline.setup()

        barrier = sub_pipeline[-1]
        sub_pipeline, data_container = barrier.join_fit_transform(
            step=sub_pipeline,
            data_container=data_container,
            context=context
        )

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        new_self = self[:index_start] + sub_pipeline
        if index_start + len(sub_pipeline) < len(self):
            new_self += self[index_start + len(sub_pipeline):]

        self.steps_as_tuple = new_self.steps_as_tuple
        index_start += len(sub_pipeline)

    data_container = self._did_fit(data_container, context)
    self._did_process(data_container, context)

    return self
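# A sketch of how a flat list of steps could be cut into sub pipelines at
# "barrier" markers, which is what _create_sub_pipelines above is assumed to
# do. The BARRIER sentinel and helper name are hypothetical.
BARRIER = object()

def _sketch_split_on_barriers(steps):
    sub_pipelines, current = [], []
    for step in steps:
        current.append(step)
        if step is BARRIER:  # each sub pipeline ends with its barrier
            sub_pipelines.append(current)
            current = []
    if current:
        sub_pipelines.append(current)
    return sub_pipelines

subs = _sketch_split_on_barriers(['a', 'b', BARRIER, 'c', BARRIER])
assert [len(s) for s in subs] == [3, 2]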
def split_data_container(self, data_container: DataContainer, context: ExecutionContext) -> List[Tuple[DataContainer, DataContainer]]:
    """
    Wrap a validation split function with a split data container function.
    A validation split function takes two arguments: data inputs, and expected outputs.

    :param data_container: data container to split
    :param context: execution context
    :return: a list of (training, validation) data container pairs, one pair per validation split
    """
    train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = self.split(
        data_inputs=data_container.data_inputs,
        expected_outputs=data_container.expected_outputs,
        context=context
    )

    train_data_container = DataContainer(data_inputs=train_data_inputs, expected_outputs=train_expected_outputs)
    validation_data_container = DataContainer(data_inputs=validation_data_inputs, expected_outputs=validation_expected_outputs)

    splits = []
    for (train_current_id, train_di, train_eo), (validation_current_id, validation_di, validation_eo) in zip(
            train_data_container, validation_data_container):
        train_data_container_split = DataContainer(
            summary_id=train_current_id,
            data_inputs=train_di,
            expected_outputs=train_eo
        )
        validation_data_container_split = DataContainer(
            summary_id=validation_current_id,
            data_inputs=validation_di,
            expected_outputs=validation_eo
        )
        splits.append((train_data_container_split, validation_data_container_split))

    return splits
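# A self-contained sketch of producing (train, validation) pairs per split,
# mirroring the list-of-pairs shape returned above with plain lists. The
# round-robin k-fold helper below is hypothetical.
def _sketch_kfold_pairs(data, k=3):
    folds = [data[i::k] for i in range(k)]
    pairs = []
    for i, validation in enumerate(folds):
        train = [x for j, fold in enumerate(folds) if j != i for x in fold]
        pairs.append((train, validation))
    return pairs

pairs = _sketch_kfold_pairs(list(range(6)), k=3)
assert len(pairs) == 3 and all(len(tr) == 4 and len(va) == 2 for tr, va in pairs)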
def fit_trial_split(self, trial_split: TrialSplit, train_data_container: DataContainer,
                    validation_data_container: DataContainer, context: ExecutionContext) -> TrialSplit:
    """
    Train pipeline using the training data container.
    Track training, and validation metrics for each epoch.

    :param trial_split: trial split to execute
    :param train_data_container: train data container
    :param validation_data_container: validation data container
    :param context: execution context
    :return: executed trial split
    """
    early_stopping = False

    for i in range(self.epochs):
        self.print_func('\nepoch {}/{}'.format(i + 1, self.epochs))
        trial_split = trial_split.fit_trial_split(train_data_container.copy(), context)

        y_pred_train = trial_split.predict_with_pipeline(train_data_container.copy(), context)
        y_pred_val = trial_split.predict_with_pipeline(validation_data_container.copy(), context)

        if self.callbacks.call(
                trial=trial_split,
                epoch_number=i,
                total_epochs=self.epochs,
                input_train=train_data_container,
                pred_train=y_pred_train,
                input_val=validation_data_container,
                pred_val=y_pred_val,
                is_finished_and_fitted=early_stopping
        ):
            break

    return trial_split
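# A plain sketch of the epoch loop above, showing how a callback returning True
# short-circuits training (early stopping). The helper and callback here are
# hypothetical stand-ins for self.callbacks.call.
def _sketch_epoch_loop(epochs, callback):
    completed = 0
    for i in range(epochs):
        completed += 1
        if callback(i):  # a truthy return value stops training early
            break
    return completed

assert _sketch_epoch_loop(10, callback=lambda i: i == 2) == 3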
def _fit_data_container(self, data_container: DataContainer, context: ExecutionContext) -> BaseStep:
    """
    Handle fit by passing the data inputs, and the expected outputs, together as a tuple of data inputs
    to the wrapped step's fit method.

    :param data_container: data container to fit on
    :param context: execution context
    :type context: ExecutionContext
    :return: fitted self
    :rtype: BaseStep
    """
    self.wrapped = self.wrapped.handle_fit(
        DataContainer(
            data_inputs=(copy.copy(data_container.data_inputs), copy.copy(data_container.expected_outputs)),
            current_ids=data_container.current_ids,
            expected_outputs=None
        ),
        context
    )
    data_container.set_data_inputs((data_container.data_inputs, data_container.expected_outputs))
    data_container.set_expected_outputs(expected_outputs=None)

    return self
def handle_fit_transform(self, data_container: DataContainer, context: ExecutionContext) -> ('BaseStep', DataContainer):
    """
    Fit transform data container.

    :param data_container: the data container to transform
    :type data_container: neuraxle.data_container.DataContainer
    :param context: execution context
    :return: tuple(fitted pipeline, data_container)
    """
    self.create_checkpoint_path(context.get_path())
    self.flush_cache()

    self.wrapped = self.wrapped.fit(data_container.data_inputs, data_container.expected_outputs)
    outputs = self._transform_with_cache(data_container)

    data_container.set_data_inputs(outputs)

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return self, data_container
def test_outer_concatenate_data_should_merge_2d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_container_2d = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d)
    data_container = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d) \
        .add_sub_data_container('2d', data_container_2d)

    # When
    p = Pipeline([ZipBatchDataContainer(sub_data_container_names=['2d'])])
    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    for i, (first_di, second_di) in enumerate(zip(data_inputs_3d, data_inputs_2d)):
        assert np.array_equal(data_container.data_inputs[i][0], first_di)
        assert np.array_equal(data_container.data_inputs[i][1], second_di)

    for i, (first_eo, second_eo) in enumerate(zip(expected_outputs_3d, expected_outputs_2d)):
        assert np.array_equal(data_container.expected_outputs[i][0], first_eo)
        assert np.array_equal(data_container.expected_outputs[i][1], second_eo)
def test_output_transformer_wrapper_should_fit_transform_with_data_inputs_and_expected_outputs():
    tape = TapeCallbackFunction()
    p = OutputTransformerWrapper(Pipeline([MultiplyByN(2), FitCallbackStep(tape)]))
    data_inputs, expected_outputs = _create_data_source((10, 10))

    p, data_container = p.handle_fit_transform(DataContainer(
        data_inputs=data_inputs,
        expected_outputs=expected_outputs
    ), ExecutionContext())

    assert np.array_equal(data_container.data_inputs, data_inputs)
    assert np.array_equal(data_container.expected_outputs, expected_outputs * 2)
    assert np.array_equal(tape.data[0][0], expected_outputs * 2)
    for i in range(10):
        assert tape.data[0][1][i] is None
def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
    """
    After loading the last checkpoint, fit transform each pipeline step.

    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the fitted pipeline, and the transformed data inputs
    """
    new_self, data_container = self.fit_transform_data_container(
        DataContainer(data_inputs=data_inputs, current_ids=None, expected_outputs=expected_outputs))

    return new_self, data_container.data_inputs
def transform(self, data_inputs: Any):
    """
    After loading the last checkpoint, transform each pipeline step.

    :param data_inputs: the data input to transform
    :return: transformed data inputs
    """
    data_container = DataContainer(current_ids=None, data_inputs=data_inputs)
    data_container = self.hash_data_container(data_container)

    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.TRANSFORM)
    context = context.push(self)

    data_container = self._transform_data_container(data_container, context)

    return data_container.data_inputs
def test_flatten_for_each_should_transform_data_inputs_and_expected_outputs():
    p = FlattenForEach(Pipeline([
        MultiplyByN(2),
        OutputTransformerWrapper(MultiplyByN(3))
    ]))
    # TODO: should use a tape here and ensure that the MultiplyByN received a flat 12 shape only once and not 3*4 things
    data_inputs, expected_outputs = _create_random_of_shape(DATA_SHAPE)

    p, outputs = p.handle_fit_transform(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
        ExecutionContext())

    assert np.array(outputs.data_inputs).shape == DATA_SHAPE
    assert np.array_equal(outputs.data_inputs, data_inputs * 2)
    assert np.array(outputs.expected_outputs).shape == DATA_SHAPE
    assert np.array_equal(outputs.expected_outputs, expected_outputs * 3)
def join_transform(self, step: TruncableSteps, data_container: DataContainer,
                   context: ExecutionContext) -> ZipDataContainer:
    """
    Transform the data container in minibatches with the given step, then zip the
    transformed batches back into a single data container.
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        keep_incomplete_batch=self.keep_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = []
    for data_container_batch in data_container_batches:
        output_data_container.append(step._transform_data_container(data_container_batch, context))

    return ZipDataContainer.create_from(*output_data_container)
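# A generator sketch of the minibatching that join_transform relies on. The
# keep_incomplete_batch flag mimics the argument above; the helper itself is a
# hypothetical stand-in for DataContainer.minibatches.
def _sketch_minibatches(items, batch_size, keep_incomplete_batch=True):
    for start in range(0, len(items), batch_size):
        batch = items[start:start + batch_size]
        if len(batch) < batch_size and not keep_incomplete_batch:
            return  # drop the trailing incomplete batch
        yield batch

batches = list(_sketch_minibatches(list(range(10)), batch_size=4))
assert [len(b) for b in batches] == [4, 4, 2]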
def test_handle_predict_should_predict_in_test_mode():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    data_container = p.handle_predict(
        data_container=DataContainer(
            data_inputs=np.array([1, 1]),
            expected_outputs=np.array([1, 1])
        ),
        context=ExecutionContext()
    )

    assert np.array_equal(data_container.data_inputs, np.array([2, 2]))
def _fit_data_container(self, data_container: DataContainer, context: ExecutionContext) -> 'BaseStep':
    """
    Fit the wrapped step self.epochs times using the wrapped step's handle fit method.

    :param data_container: data container
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: fitted self
    :rtype: BaseStep
    """
    epochs = self._get_epochs()

    for _ in range(epochs):
        self.wrapped = self.wrapped.handle_fit(data_container.copy(), context)

    return self
def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
        Tuple['Any', DataContainer]:
    """
    Fit transform the data container in minibatches with the given step, then zip the
    transformed batches back into a single data container.
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        keep_incomplete_batch=self.keep_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = []
    for data_container_batch in data_container_batches:
        step, data_container_batch = step._fit_transform_data_container(data_container_batch, context)
        output_data_container.append(data_container_batch)

    return step, ZipDataContainer.create_from(*output_data_container)
def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
    """
    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the fitted pipeline, and the transformed data inputs
    """
    self.setup()

    data_container = DataContainer(current_ids=None, data_inputs=data_inputs, expected_outputs=expected_outputs)
    data_container = self.hash_data_container(data_container)

    context = ExecutionContext(self.cache_folder, ExecutionMode.FIT_TRANSFORM)
    new_self, data_container = self.handle_fit_transform(data_container, context)

    return new_self, data_container.data_inputs
def test_joblib_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    create_pipeline_output_transformer(tmpdir).fit_transform(
        data_inputs=initial_data_inputs, expected_outputs=initial_expected_outputs)

    actual_data_container = create_pipeline_output_transformer(tmpdir).handle_transform(
        DataContainer(
            data_inputs=initial_data_inputs,
            current_ids=[0, 1],
            expected_outputs=initial_expected_outputs
        ),
        ExecutionContext(tmpdir)
    )

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
def test_queued_pipeline_with_step_with_threading():
    p = SequentialQueuedPipeline(
        [MultiplyByN(2), MultiplyByN(2), MultiplyByN(2), MultiplyByN(2)],
        batch_size=10,
        n_workers_per_step=1,
        max_queue_size=5,
        use_processes=False
    )
    data_container = DataContainer(data_inputs=list(range(100)))
    context = ExecutionContext()

    outputs = p.handle_transform(data_container, context)

    assert np.array_equal(outputs.data_inputs, EXPECTED_OUTPUTS)
def test_logger_automl(self, tmpdir):
    # Given
    context = ExecutionContext()
    self.tmpdir = str(tmpdir)
    hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
    n_epochs = 2
    n_trials = 4
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            LoggingStep()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
    )
    auto_ml.handle_fit(data_container, context)

    # Then
    file_paths = [
        os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
        for i in range(n_trials)
    ]
    assert len(file_paths) == n_trials

    for file_path in file_paths:
        assert os.path.exists(file_path)

    for file_path in file_paths:
        with open(file_path, 'r') as log_file:
            log = log_file.readlines()
            assert len(log) == 36
def _transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Handle transform by passing the data inputs, and the expected outputs, together as a tuple of data inputs
    to the wrapped step's transform method, and returning the wrapped step's output data container.

    :param data_container: data container to transform
    :param context: execution context
    :return: data container
    :rtype: DataContainer
    """
    output_data_container = self.wrapped.handle_transform(
        DataContainer(
            data_inputs=(data_container.data_inputs, data_container.expected_outputs),
            current_ids=data_container.current_ids,
            expected_outputs=None
        ),
        context
    )
    return output_data_container
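# A sketch of the packing convention above: the wrapped step receives one
# "data input" that is really the (data_inputs, expected_outputs) pair, so it
# can transform both at once. The stand-in step below is hypothetical.
def _sketch_wrapped_transform(packed):
    data_inputs, expected_outputs = packed  # unpack the pair
    return [di * 2 for di in data_inputs], expected_outputs

new_di, eo = _sketch_wrapped_transform(([1, 2, 3], [4, 5, 6]))
assert new_di == [2, 4, 6] and eo == [4, 5, 6]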
def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> (
        'BaseStep', DataContainer):
    """
    Fit transform wrapped step self.epochs times using wrapped step handle fit transform method.

    :param data_container: data container
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: (fitted self, data container)
    :rtype: (BaseStep, DataContainer)
    """
    if not self.fit_only:
        for _ in range(self.epochs - 1):
            self.wrapped = self.wrapped.handle_fit(data_container.copy(), context)

    self.wrapped, data_container = self.wrapped.handle_fit_transform(data_container, context)

    return self, data_container
def train(self, pipeline: BaseStep, data_inputs, expected_outputs=None,
          context: ExecutionContext = None, trial_number=0) -> Trial:
    """
    Train pipeline using the validation splitter.
    Track training, and validation metrics for each epoch.

    Note: the present method is just a shortcut to using the `execute_trial` method with less boilerplate code
    needed. Refer to `execute_trial` for full flexibility.

    :param pipeline: pipeline to train on
    :param data_inputs: data inputs
    :param expected_outputs: expected outputs to fit on
    :param context: execution context
    :param trial_number: trial number
    :return: executed trial
    """
    assert context is not None  # TODO: change order of arguments so that context isn't an optional argument

    validation_splits: List[Tuple[DataContainer, DataContainer]] = \
        self.validation_split_function.split_data_container(
            DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
            context=context
        )

    repo_trial: Trial = Trial(
        pipeline=pipeline,
        logger=context.logger,
        hyperparams=pipeline.get_hyperparams(),
        main_metric_name=self.get_main_metric_name(),
        save_trial_function=self.hyperparams_repository.save_trial,
        trial_number=trial_number
    )

    self.execute_trial(
        pipeline=pipeline,
        repo_trial=repo_trial,
        context=context,
        validation_splits=validation_splits,
        n_trial=1,
        delete_pipeline_on_completion=False
    )

    return repo_trial
def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> (
        BaseStep, DataContainer):
    """
    Handle fit transform by passing the data inputs, and the expected outputs, together as a tuple of data inputs
    to the wrapped step's fit transform method, and returning the wrapped step's output data container.

    :param data_container: data container to fit on
    :param context: execution context
    :type context: ExecutionContext
    :return: self, data container
    :rtype: (BaseStep, DataContainer)
    """
    self.wrapped, output_data_container = self.wrapped.handle_fit_transform(
        DataContainer(
            data_inputs=(data_container.data_inputs, data_container.expected_outputs),
            current_ids=data_container.current_ids,
            expected_outputs=None
        ),
        context
    )
    return self, output_data_container
def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
    """
    After loading the last checkpoint, fit transform each pipeline step.

    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the fitted pipeline, and the transformed data inputs
    """
    data_container = DataContainer(
        current_ids=None,
        data_inputs=data_inputs,
        expected_outputs=expected_outputs
    )
    data_container = self.hash_data_container(data_container)

    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.FIT_TRANSFORM)
    context = context.push(self)

    new_self, data_container = self._fit_transform_data_container(data_container, context)

    return new_self, data_container.data_inputs