def split_data_container(self, data_container) -> Tuple[DataContainer, DataContainer]:
    """
    Split a data container into a training set and a validation set.

    :param data_container: data container
    :type data_container: DataContainer
    :return: train_data_container, validation_data_container
    """
    train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = \
        self.split(data_container.data_inputs, data_container.expected_outputs)

    train_ids = self.train_split(data_container.current_ids)
    train_data_container = DataContainer(
        data_inputs=train_data_inputs,
        current_ids=train_ids,
        summary_id=data_container.summary_id,
        expected_outputs=train_expected_outputs)

    validation_ids = self.validation_split(data_container.current_ids)
    validation_data_container = DataContainer(
        data_inputs=validation_data_inputs,
        current_ids=validation_ids,
        summary_id=data_container.summary_id,
        expected_outputs=validation_expected_outputs)

    return train_data_container, validation_data_container
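# Hedged usage sketch (added for illustration, not from the original source):
# how the split_data_container above would be called on a splitter instance.
# The splitter is assumed to expose split(), train_split() and
# validation_split(), as used by the method above; concrete splitter class
# names and import paths vary across Neuraxle versions.
def example_single_split_usage(splitter, data_container: DataContainer):
    train_dc, validation_dc = splitter.split_data_container(data_container)

    # Both halves keep the parent's summary_id, so checkpointing and hashing
    # remain consistent across the split.
    assert train_dc.summary_id == data_container.summary_id
    assert validation_dc.summary_id == data_container.summary_id
    return train_dc, validation_dc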
def test_inner_concatenate_data_should_merge_1d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_1d, expected_outputs_1d = _create_data_source(SHAPE_1D)
    data_container_1d = DataContainer(data_inputs=data_inputs_1d, expected_outputs=expected_outputs_1d)
    data_container = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d) \
        .add_sub_data_container('1d', data_container_1d)

    # When
    p = Pipeline([InnerConcatenateDataContainer(sub_data_container_names=['1d'])])
    data_container = p.handle_transform(data_container, ExecutionContext())

    # Then
    broadcasted_data_inputs_1d = np.broadcast_to(
        np.expand_dims(data_container_1d.data_inputs, axis=-1),
        shape=(SHAPE_3D[0], SHAPE_3D[1]))
    broadcasted_expected_outputs_1d = np.broadcast_to(
        np.expand_dims(data_container_1d.expected_outputs, axis=-1),
        shape=(SHAPE_3D[0], SHAPE_3D[1]))

    assert np.array_equal(data_container.data_inputs[..., -1], broadcasted_data_inputs_1d)
    assert np.array_equal(data_container.expected_outputs[..., -1], broadcasted_expected_outputs_1d)
    assert data_container.data_inputs.shape == (SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert data_container.expected_outputs.shape == (SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
def test_inner_concatenate_data_should_merge_2d_with_3d():
    # Given
    data_inputs_3d, expected_outputs_3d = _create_data_source(SHAPE_3D)
    data_inputs_2d, expected_outputs_2d = _create_data_source(SHAPE_2D)
    data_container_2d = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d)
    data_container_3d = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d) \
        .add_sub_data_container('2d', data_container_2d)

    # When
    p = Pipeline([InnerConcatenateDataContainer(sub_data_container_names=['2d'])])
    data_container_3d = p.handle_transform(data_container_3d, ExecutionContext())

    # Then
    assert data_container_3d.data_inputs.shape == (SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert data_container_3d.expected_outputs.shape == (SHAPE_3D[0], SHAPE_3D[1], SHAPE_3D[2] + 1)
    assert np.array_equal(data_container_3d.data_inputs[..., -1], data_container_2d.data_inputs)
    assert np.array_equal(data_container_3d.expected_outputs[..., -1], data_container_2d.expected_outputs)
def split_data_container(self, data_container: DataContainer) -> List[Tuple[DataContainer, DataContainer]]:
    """
    Wrap a validation split function with a split data container function.
    A validation split function takes two arguments: data inputs, and expected outputs.

    :param data_container: data container to split
    :return: a list of (train, validation) data container pairs, one pair per validation split
    """
    train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = self.split(
        data_inputs=data_container.data_inputs,
        expected_outputs=data_container.expected_outputs
    )

    train_data_container = DataContainer(data_inputs=train_data_inputs, expected_outputs=train_expected_outputs)
    validation_data_container = DataContainer(data_inputs=validation_data_inputs, expected_outputs=validation_expected_outputs)

    splits = []
    for (train_current_id, train_di, train_eo), (validation_current_id, validation_di, validation_eo) in zip(
            train_data_container, validation_data_container):
        train_data_container_split = DataContainer(
            summary_id=train_current_id,
            data_inputs=train_di,
            expected_outputs=train_eo
        )
        validation_data_container_split = DataContainer(
            summary_id=validation_current_id,
            data_inputs=validation_di,
            expected_outputs=validation_eo
        )
        splits.append((train_data_container_split, validation_data_container_split))

    return splits
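# Hedged sketch (added for illustration): consuming the list of
# (train, validation) pairs returned by split_data_container above. This is
# the same structure that extract_validation_split_data unpacks in the K-fold
# tests later in this file.
def example_iterate_splits(splitter, data_container: DataContainer):
    for train_dc, validation_dc in splitter.split_data_container(data_container):
        # Each pair is a self-contained DataContainer split; a trainer would
        # fit on train_dc and score on validation_dc here.
        print(len(train_dc.data_inputs), len(validation_dc.data_inputs))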
def split_data_container(self, data_container: DataContainer) -> Tuple[DataContainer, DataContainer]:
    """
    Split a data container into a single (train, validation) pair using self.split.

    :param data_container: data container to split
    :return: train_data_container, validation_data_container
    """
    train_data_inputs, train_expected_outputs, validation_data_inputs, validation_expected_outputs = self.split(
        data_container.data_inputs, data_container.expected_outputs
    )

    train_data_container = DataContainer(data_inputs=train_data_inputs, expected_outputs=train_expected_outputs)
    validation_data_container = DataContainer(data_inputs=validation_data_inputs, expected_outputs=validation_expected_outputs)

    return train_data_container, validation_data_container
def test_zip_data_container_should_merge_two_data_sources_together():
    data_inputs_3d, expected_outputs_3d = _create_data_source((10, 10, 2))
    data_inputs_2d, expected_outputs_2d = _create_data_source((10, 10))
    data_container_2d = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d)
    data_container = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d)

    zip_data_container = ZipDataContainer.create_from(data_container, data_container_2d)

    assert zip_data_container.current_ids == data_container.current_ids
    for i, di in enumerate(zip_data_container.data_inputs):
        assert np.array_equal(di[0], data_inputs_3d[i])
        assert np.array_equal(di[1], data_inputs_2d[i])
def train(self, pipeline: BaseStep, data_inputs, expected_outputs=None) -> Trial:
    """
    Train the pipeline using the validation splitter.
    Track training and validation metrics for each epoch.

    Note: the present method is just a shortcut to using the `execute_trial` method with less
    boilerplate code needed. Refer to `execute_trial` for full flexibility.

    :param pipeline: pipeline to train on
    :param data_inputs: data inputs
    :param expected_outputs: expected outputs to fit on
    :return: executed trial
    """
    validation_splits: List[Tuple[DataContainer, DataContainer]] = self.validation_split_function.split_data_container(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs)
    )

    repo_trial: Trial = Trial(
        pipeline=pipeline,
        hyperparams=pipeline.get_hyperparams(),
        main_metric_name=self.get_main_metric_name()
    )

    self.execute_trial(
        pipeline=pipeline,
        trial_number=1,
        repo_trial=repo_trial,
        context=ExecutionContext(),
        validation_splits=validation_splits,
        n_trial=1,
        delete_pipeline_on_completion=False
    )

    return repo_trial
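# Hedged usage sketch (not from the original source): calling the train()
# shortcut above. The trainer's construction is left as a parameter because
# its constructor arguments vary across Neuraxle versions; only the train()
# signature comes from the method defined above.
def example_train_usage(trainer, pipeline, data_inputs, expected_outputs):
    # train() wraps the raw arrays in a DataContainer, splits them with the
    # trainer's validation_split_function, and runs a single trial through
    # execute_trial with n_trial=1.
    repo_trial = trainer.train(
        pipeline=pipeline,
        data_inputs=data_inputs,
        expected_outputs=expected_outputs)
    return repo_trial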
def test_data_container_batching(batch_size, include_incomplete_pass, default_value, expected_data_containers):
    data_container = DataContainer(
        current_ids=[str(i) for i in range(10)],
        data_inputs=np.array(list(range(10))),
        expected_outputs=np.array(list(range(10, 20))))

    # When
    data_containers = []
    for dc in data_container.minibatches(
            batch_size=batch_size,
            include_incomplete_batch=include_incomplete_pass,
            default_value_data_inputs=default_value):
        data_containers.append(dc)

    # Then
    assert len(expected_data_containers) == len(data_containers)
    for expected_data_container, actual_data_container in zip(expected_data_containers, data_containers):
        assert np.array_equal(expected_data_container.current_ids, actual_data_container.current_ids)
        assert np.array_equal(expected_data_container.data_inputs, actual_data_container.data_inputs)
        assert np.array_equal(expected_data_container.expected_outputs, actual_data_container.expected_outputs)
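# Hedged sketch (added for illustration, mirrors the parametrized test above):
# iterating DataContainer.minibatches directly with concrete arguments. The
# keyword names come from the call in the test; the batch count below follows
# from 10 items at batch_size=3 with the incomplete final batch included.
def example_minibatch_iteration():
    dc = DataContainer(
        current_ids=[str(i) for i in range(10)],
        data_inputs=np.array(list(range(10))),
        expected_outputs=np.array(list(range(10, 20))))

    batches = list(dc.minibatches(
        batch_size=3,
        include_incomplete_batch=True,
        default_value_data_inputs=0))

    # 3 full batches of 3, plus one trailing batch for the remaining item.
    assert len(batches) == 4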
def test_pickle_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    create_pipeline_output_transformer = lambda: ResumablePipeline([
        ('output_transformer_1', MultiplyBy2OutputTransformer()),
        ('pickle_checkpoint', DefaultCheckpoint()),
        ('output_transformer_2', MultiplyBy2OutputTransformer()),
    ], cache_folder=tmpdir)

    create_pipeline_output_transformer().fit_transform(
        data_inputs=initial_data_inputs, expected_outputs=initial_expected_outputs)

    transformer = create_pipeline_output_transformer()
    actual_data_container = transformer.handle_transform(
        DataContainer(
            current_ids=[0, 1],
            data_inputs=initial_data_inputs,
            expected_outputs=initial_expected_outputs),
        ExecutionContext.create_from_root(transformer, ExecutionMode.TRANSFORM, tmpdir))

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
def test_kfold_cross_validation_should_split_data_properly_bug():
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = np.array([0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40])
    data_container = DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs)
    splitter = KFoldCrossValidationSplitter(k_fold=2)

    # When
    validation_splits = splitter.split_data_container(data_container)
    train_di, train_eo, validation_di, validation_eo = extract_validation_split_data(validation_splits)

    # Then
    assert len(train_di[0]) == 6
    assert np.array_equal(np.array(train_di[0]), data_inputs[5:])
    assert len(train_eo[0]) == 6
    assert np.array_equal(np.array(train_eo[0]), expected_outputs[5:])
    assert len(train_di[1]) == 5
    assert np.array_equal(np.array(train_di[1]), data_inputs[:5])
    assert len(train_eo[1]) == 5
    assert np.array_equal(np.array(train_eo[1]), expected_outputs[:5])

    assert len(validation_di[0]) == 5
    assert np.array_equal(np.array(validation_di[0]), data_inputs[:5])
    assert len(validation_eo[0]) == 5
    assert np.array_equal(np.array(validation_eo[0]), expected_outputs[:5])
    assert len(validation_di[1]) == 6
    assert np.array_equal(np.array(validation_di[1]), data_inputs[5:])
    assert len(validation_eo[1]) == 6
    assert np.array_equal(np.array(validation_eo[1]), expected_outputs[5:])
def test_logger():
    file_path = "test.log"
    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        log_contents = f.read()
    assert len(log_contents) > 0

    # Teardown
    file_handler.close()
    os.remove(file_path)
def test_validation_splitter_should_split_data_properly():
    # Given
    data_inputs = np.random.random((4, 2, 2048, 6)).astype(np.float32)
    expected_outputs = np.random.random((4, 2, 2048, 1)).astype(np.float32)
    splitter = ValidationSplitter(test_size=0.2)

    # When
    validation_splits = splitter.split_data_container(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs))
    train_di, train_eo, validation_di, validation_eo = extract_validation_split_data(validation_splits)
    train_di = train_di[0]
    train_eo = train_eo[0]
    validation_di = validation_di[0]
    validation_eo = validation_eo[0]

    # Then
    assert len(train_di) == 3
    assert np.array_equal(np.array(train_di), data_inputs[0:3])
    assert len(train_eo) == 3
    assert np.array_equal(np.array(train_eo), expected_outputs[0:3])
    assert len(validation_di) == 1
    assert np.array_equal(validation_di[0], data_inputs[-1])
    assert len(validation_eo) == 1
    assert np.array_equal(validation_eo[0], expected_outputs[-1])
def test_list_data_container_concat():
    # Given
    data_container = ListDataContainer(
        current_ids=[str(i) for i in range(100)],
        data_inputs=np.array(list(range(100))),
        expected_outputs=np.array(list(range(100, 200))))

    # When
    data_container.concat(DataContainer(
        current_ids=[str(i) for i in range(100, 200)],
        data_inputs=np.array(list(range(100, 200))),
        expected_outputs=np.array(list(range(200, 300)))))

    # Then
    # Note: np.str and np.int were removed in NumPy 1.24; the built-in
    # str and int types are the correct casts here.
    assert np.array_equal(
        np.array(data_container.current_ids),
        np.array(list(range(0, 200))).astype(str))

    expected_data_inputs = np.array(list(range(0, 200))).astype(int)
    actual_data_inputs = np.array(data_container.data_inputs).astype(int)
    assert np.array_equal(actual_data_inputs, expected_data_inputs)

    expected_expected_outputs = np.array(list(range(100, 300))).astype(int)
    assert np.array_equal(
        np.array(data_container.expected_outputs).astype(int),
        expected_expected_outputs)
def handle_inverse_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Handle inverse transform by passing expected outputs to the wrapped step's inverse transform method.
    Update the expected outputs with the outputs.

    :param context: execution context
    :param data_container: data container to inverse transform
    :return: data container
    :rtype: DataContainer
    """
    new_expected_outputs_data_container = self.wrapped.handle_inverse_transform(
        DataContainer(
            current_ids=data_container.current_ids,
            data_inputs=data_container.expected_outputs,
            expected_outputs=None
        ),
        context.push(self.wrapped)
    )
    data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return data_container
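# Hedged sketch (an assumption, not from the source): what the inverse
# transform wiring above does end to end. The wrapper feeds expected_outputs
# into the wrapped step as if they were data inputs, then writes the result
# back as the new expected_outputs. MultiplyByN is assumed to implement
# inverse_transform as division, as it does elsewhere in Neuraxle.
def example_inverse_transform_usage():
    wrapper = OutputTransformerWrapper(MultiplyByN(2))
    dc = DataContainer(
        current_ids=[0, 1, 2],
        data_inputs=np.array([1, 2, 3]),
        expected_outputs=np.array([2, 4, 6]))

    out = wrapper.handle_inverse_transform(dc, ExecutionContext())

    # Expected outputs were divided back by 2; data inputs are untouched.
    assert np.array_equal(out.expected_outputs, np.array([1, 2, 3]))
    assert np.array_equal(out.data_inputs, np.array([1, 2, 3]))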
def test_zip_data_container_should_concatenate_inner_features():
    data_inputs_3d, expected_outputs_3d = _create_data_source((10, 10, 2))
    data_inputs_2d, expected_outputs_2d = _create_data_source((10, 10))
    data_container_2d = DataContainer(data_inputs=data_inputs_2d, expected_outputs=expected_outputs_2d)
    data_container = DataContainer(data_inputs=data_inputs_3d, expected_outputs=expected_outputs_3d)

    zip_data_container = ZipDataContainer.create_from(data_container, data_container_2d)
    zip_data_container.concatenate_inner_features()

    assert np.array_equal(np.array(zip_data_container.data_inputs)[..., -1], data_container_2d.data_inputs)
    assert np.array_equal(np.array(zip_data_container.expected_outputs), expected_outputs_3d)
def transform(self, data_inputs: Any): """ After loading the last checkpoint, transform each pipeline steps :param data_inputs: the data input to transform :return: transformed data inputs """ data_container = self.transform_data_container(DataContainer(data_inputs=data_inputs, current_ids=None)) return data_container.data_inputs
def test_kfold_cross_validation_should_split_data_properly():
    # Given
    data_inputs = np.random.random((4, 2, 2048, 6)).astype(np.float32)
    expected_outputs = np.random.random((4, 2, 2048, 1)).astype(np.float32)
    splitter = KFoldCrossValidationSplitter(k_fold=4)

    # When
    validation_splits = splitter.split_data_container(
        data_container=DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
        context=ExecutionContext()
    )
    train_di, train_eo, validation_di, validation_eo = extract_validation_split_data(validation_splits)

    # Then
    assert len(train_di[0]) == 3
    assert np.array_equal(np.array(train_di[0]), data_inputs[1:])
    assert len(train_eo[0]) == 3
    assert np.array_equal(np.array(train_eo[0]), expected_outputs[1:])

    assert len(train_di[1]) == 3
    assert np.array_equal(
        np.array(train_di[1]),
        np.concatenate((np.expand_dims(data_inputs[0], axis=0), data_inputs[2:]), axis=0))
    assert len(train_eo[1]) == 3
    assert np.array_equal(
        np.array(train_eo[1]),
        np.concatenate((np.expand_dims(expected_outputs[0], axis=0), expected_outputs[2:]), axis=0))

    assert len(train_di[2]) == 3
    assert np.array_equal(
        np.array(train_di[2]),
        np.concatenate((data_inputs[0:2], np.expand_dims(data_inputs[3], axis=0)), axis=0))
    assert len(train_eo[2]) == 3
    assert np.array_equal(
        np.array(train_eo[2]),
        np.concatenate((expected_outputs[0:2], np.expand_dims(expected_outputs[3], axis=0)), axis=0))

    assert len(train_di[3]) == 3
    assert np.array_equal(np.array(train_di[3]), data_inputs[0:3])
    assert len(train_eo[3]) == 3
    assert np.array_equal(np.array(train_eo[3]), expected_outputs[0:3])

    assert len(validation_di[0]) == 1
    assert np.array_equal(validation_di[0][0], data_inputs[0])
    assert len(validation_eo[0]) == 1
    assert np.array_equal(validation_eo[0][0], expected_outputs[0])

    assert len(validation_di[1]) == 1
    assert np.array_equal(validation_di[1][0], data_inputs[1])
    assert len(validation_eo[1]) == 1
    assert np.array_equal(validation_eo[1][0], expected_outputs[1])

    assert len(validation_di[2]) == 1
    assert np.array_equal(validation_di[2][0], data_inputs[2])
    assert len(validation_eo[2]) == 1
    assert np.array_equal(validation_eo[2][0], expected_outputs[2])

    assert len(validation_di[3]) == 1
    assert np.array_equal(validation_di[3][0], data_inputs[3])
    assert len(validation_eo[3]) == 1
    assert np.array_equal(validation_eo[3][0], expected_outputs[3])
def test_data_container_iter_method_should_iterate_with_none_expected_outputs():
    data_container = DataContainer(
        current_ids=[str(i) for i in range(100)],
        data_inputs=np.array(list(range(100))),
        expected_outputs=None)

    for i, (current_id, data_input, expected_outputs) in enumerate(data_container):
        assert data_input == i
        assert expected_outputs is None
def test_input_and_output_transformer_wrapper_should_not_return_a_different_amount_of_data_inputs_and_expected_outputs():
    with pytest.raises(AssertionError):
        p = InputAndOutputTransformerWrapper(ChangeLenDataInputs())
        data_inputs, expected_outputs = _create_data_source((10, 10))

        p.handle_transform(
            DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
            ExecutionContext())
def fit(self, data_inputs, expected_outputs=None) -> 'Pipeline':
    """
    After loading the last checkpoint, fit each pipeline step.

    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the pipeline itself
    """
    return self.fit_data_container(
        DataContainer(data_inputs=data_inputs, current_ids=None, expected_outputs=expected_outputs))
def test_input_and_output_transformer_wrapper_should_raise_an_assertion_error_if_current_ids_have_not_been_resampled_correctly():
    with pytest.raises(AssertionError):
        p = InputAndOutputTransformerWrapper(ChangeLenDataInputsAndExpectedOutputs())
        data_inputs, expected_outputs = _create_data_source((10, 10))

        p.handle_transform(
            DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
            ExecutionContext())
def fit_transform(self, data_inputs, expected_outputs=None) -> ('Pipeline', Any):
    """
    After loading the last checkpoint, fit transform each pipeline step.

    :param data_inputs: the data input to fit on
    :param expected_outputs: the expected data output to fit on
    :return: the pipeline itself, and the transformed data inputs
    """
    new_self, data_container = self.fit_transform_data_container(
        DataContainer(data_inputs=data_inputs, current_ids=None, expected_outputs=expected_outputs))

    return new_self, data_container.data_inputs
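# Hedged sketch (added for illustration): the three shortcut entry points
# above (fit, transform, fit_transform) all wrap raw arrays in a DataContainer
# with current_ids=None so the pipeline can rehash and resume from checkpoints.
# A caller only ever sees plain arrays:
def example_pipeline_entry_points(pipeline, data_inputs, expected_outputs):
    pipeline = pipeline.fit(data_inputs, expected_outputs)
    outputs = pipeline.transform(data_inputs)
    pipeline, outputs_again = pipeline.fit_transform(data_inputs, expected_outputs)
    return pipeline, outputs, outputs_again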
def test_output_transformer_wrapper_should_transform_with_data_inputs_and_expected_outputs():
    p = OutputTransformerWrapper(MultiplyByN(2))
    data_inputs, expected_outputs = _create_data_source((10, 10))

    data_container = p.handle_transform(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
        ExecutionContext())

    assert np.array_equal(data_container.data_inputs, data_inputs)
    assert np.array_equal(data_container.expected_outputs, expected_outputs * 2)
def test_output_transformer_should_zip_data_input_and_expected_output_in_the_transformed_output(tmpdir: LocalPath):
    pipeline = Pipeline([MultiplyBy2OutputTransformer()])

    pipeline, new_data_container = pipeline.handle_fit_transform(
        DataContainer(data_inputs=[1, 2, 3], current_ids=[0, 1, 2], expected_outputs=[2, 3, 4]),
        ExecutionContext(tmpdir))

    assert new_data_container.data_inputs == [2, 4, 6]
    assert new_data_container.expected_outputs == [4, 6, 8]
def test_data_container_iter_method_should_iterate_with_none_current_ids():
    data_container = DataContainer(
        data_inputs=np.array(list(range(100))),
        expected_outputs=np.array(list(range(100, 200)))).set_current_ids(None)

    for i, (current_id, data_input, expected_outputs) in enumerate(data_container):
        assert current_id is None
        assert data_input == i
        assert expected_outputs == i + 100
def test_fit_transform_input_and_output_transformer_mixin_with_hashlib_md5_value_hasher():
    step, data_container = WindowTimeSeries().handle_fit_transform(
        data_container=DataContainer(
            data_inputs=np.array(list(range(10))),
            expected_outputs=np.array(list(range(10)))
        ),
        context=ExecutionContext()
    )

    assert np.array_equal(data_container.data_inputs, np.array(list(range(0, 5))))
    assert np.array_equal(data_container.expected_outputs, np.array(list(range(5, 10))))
def handle_fit(self, data_container: DataContainer, context: ExecutionContext) -> (BaseStep, DataContainer):
    # Fit the wrapped step on the expected outputs instead of the data inputs,
    # then rehash the data container with the fitted step.
    self.wrapped = self.wrapped.handle_fit(
        DataContainer(
            current_ids=data_container.current_ids,
            data_inputs=data_container.expected_outputs,
            expected_outputs=None),
        context.push(self.wrapped))

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return self, data_container
def plot_predictions(data_inputs, expected_outputs, pipeline, max_plotted_predictions):
    _, _, data_inputs_validation, expected_outputs_validation = \
        pipeline.get_step_by_name('ValidationSplitWrapper').split(data_inputs, expected_outputs)

    pipeline.apply('toggle_plotting')
    pipeline.apply('set_max_plotted_predictions', max_plotted_predictions)

    signal_prediction_pipeline = pipeline.get_step_by_name('SignalPrediction')
    signal_prediction_pipeline.transform_data_container(
        DataContainer(data_inputs=data_inputs_validation, expected_outputs=expected_outputs_validation))
def test_data_doubler():
    p = InputAndOutputTransformerWrapper(DoubleData())
    data_inputs, expected_outputs = _create_data_source((10, 10))

    out = p.handle_transform(
        DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs),
        ExecutionContext())

    doubled_length = len(out.data_inputs)
    assert doubled_length == 2 * len(data_inputs)
    assert doubled_length == len(out.expected_outputs)
    assert doubled_length == len(out.current_ids)
def transform(self, data_inputs: Any): """ :param data_inputs: the data input to transform :return: transformed data inputs """ data_container = DataContainer(current_ids=None, data_inputs=data_inputs) self.hash_data_container(data_container) context = ExecutionContext(self.cache_folder, ExecutionMode.TRANSFORM) data_container = self.handle_transform(data_container, context) return data_container.data_inputs