def test_combine_samples_to_batch(self, padded, fix_batch_dimension=True):
    batch_size = 3
    if padded:
        inputs = {'input1': tf.placeholder(tf.float32, [10, None]),
                  'input2': tf.placeholder(tf.float32, [10, 2, None])}
    else:
        inputs = {'input1': tf.placeholder(tf.float32, [10, 5]),
                  'input2': tf.placeholder(tf.float32, [10, 2, 3])}
    dataset = Dataset(fix_batch_dimension=fix_batch_dimension).build()
    data = tf.data.Dataset.from_tensor_slices(inputs)
    data_batch = dataset.combine_samples_to_batch(data, batch_size)
    batch_size_in_shape = batch_size if fix_batch_dimension else None
    if padded:
        output_shapes_must = {
            'input1': [batch_size_in_shape, None],
            'input2': [batch_size_in_shape, 2, None]
        }
    else:
        output_shapes_must = {
            'input1': [batch_size_in_shape, 5],
            'input2': [batch_size_in_shape, 2, 3]
        }
    output_shapes_as_list = {
        k: each_shape.as_list()
        for k, each_shape in data_batch.output_shapes.items()}
    self.assertDictEqual(output_shapes_must, output_shapes_as_list)

def test_repeat(self):
    data = tf.data.Dataset.from_tensor_slices(self.inputs)
    dataset = Dataset().build()
    data_repeat = dataset.repeat(data)
    outputs = _get_data_results(
        data_repeat, self.test_session(),
        max_iteration=self.number_of_samples * 5)
    outputs_repeated_must = {k: v * 5 for k, v in self.inputs.items()}
    self.assertDictEqual(outputs_repeated_must, outputs)

def test_constructor(self):
    with self.assertRaises(ValueError):
        Dataset(number_of_shards=0, shard_index=0)
    with self.assertRaises(ValueError):
        Dataset(number_of_shards=2, shard_index=2)
    with self.assertRaises(ValueError):
        Dataset(shuffle_buffer_size=-1)
    with self.assertRaises(ValueError):
        Dataset(prefetch_buffer_size=-1)
    temp_dir = self.get_temp_dir()
    with self.assertRaises(FileNotFoundError):
        Dataset(cache_dir=os.path.join(temp_dir, 'cache1', 'cache2'))

def test_shard(self, number_of_shards=4, shard_index=2):
    data = tf.data.Dataset.from_tensor_slices(self.inputs)
    dataset = Dataset(number_of_shards=number_of_shards,
                      shard_index=shard_index).build()
    data_sharded = dataset.shard(data)
    outputs = _get_data_results(data_sharded, self.test_session())
    if number_of_shards == 1:
        outputs_must = self.inputs
    else:
        index_start = shard_index
        shard_step = number_of_shards
        outputs_must = {k: v[index_start:None:shard_step]
                        for k, v in self.inputs.items()}
    self.assertDictEqual(outputs_must, outputs)

def test_shuffle(self, shuffle_buffer_size):
    data = tf.data.Dataset.from_tensor_slices(self.inputs)
    dataset = Dataset(shuffle_buffer_size=shuffle_buffer_size).build()
    data_shuffled = dataset.shuffle(data)
    outputs = _get_data_results(data_shuffled, self.test_session())
    if shuffle_buffer_size == 1:
        self.assertDictEqual(self.inputs, outputs)
        return
    for each_key, each_list_value in self.inputs.items():
        self.assertNotEqual(each_list_value, outputs[each_key])
        self.assertSetEqual(set(each_list_value), set(outputs[each_key]))

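# NOTE: _get_data_results is defined elsewhere in this test module. The sketch
# below is only an assumed reconstruction, inferred from how the tests above
# use it: it drains a tf.data.Dataset inside the given session and collects the
# per-sample values into a dict of lists keyed like the dataset elements. The
# exact signature and the max_iteration stop condition are assumptions.
def _get_data_results(data, sess, max_iteration=None):
    iterator = data.make_initializable_iterator()
    sample = iterator.get_next()
    sess.run(iterator.initializer)
    results = {each_key: [] for each_key in sample}
    iteration = 0
    while max_iteration is None or iteration < max_iteration:
        try:
            sample_eval = sess.run(sample)
        except tf.errors.OutOfRangeError:
            break
        for each_key, each_value in sample_eval.items():
            results[each_key].append(each_value)
        iteration += 1
    return results
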
def test_call(self):
    output_keys_mapping = {"processor1": {"data11": "data11_rm"}}
    processor1 = _DataGenerator1(name="processor1").build()
    processor2 = _DataGenerator2(name="processor2").build()
    data_pipe = DataPipe(processors=[processor1, processor2]).build()
    dataset = Dataset.from_data_pipe(
        data_pipe=data_pipe, random_seed=111,
        output_keys_mapping=output_keys_mapping).build()
    dataset.mode = "train"
    result = dataset(batch_size=4)
    result_iterator = result.make_initializable_iterator()
    batch = result_iterator.get_next()
    shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
    shapes_must = {
        "data11_rm": [None, 10, 5],
        "data12": [None, 1],
        "data21": [None, 1, 3],
        "data22": [None, 5]
    }
    self.assertAllEqual(shapes_must, shapes)
    self.assertSetEqual(set(shapes_must), set(dataset.generated_keys_all))
    self.assertEqual(111, dataset.random_seed)
    self.assertIsInstance(dataset, Dataset)
    self.evaluate(result_iterator.initializer)
    _ = self.evaluate(batch)

def _build_single_dataset(dataset_config) -> Dataset:
    """
    Build a single dataset from its config

    Parameters
    ----------
    dataset_config
        dataset config

    Returns
    -------
    dataset
        built dataset
    """
    deprecated.replace_deprecated_parameter_in_config(
        "name", "subtype", dataset_config, required=False)
    if ("data_pipe" in dataset_config
            and "class_name" not in dataset_config):
        build_fn = lambda x: Dataset.from_data_pipe(**x).build()
    else:
        build_fn = None
    dataset = data_builder_lib.build_data_object_from_config(
        config=dataset_config, base_cls=Dataset, built_fn=build_fn)
    return dataset

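# Illustrative only: hypothetical configs for _build_single_dataset. The keys
# shown are the ones the function itself inspects ("data_pipe", "class_name");
# the concrete names and values are made up for this sketch.
#
#   # registered-class path: "class_name" selects the Dataset subclass
#   dataset = _build_single_dataset(
#       {"class_name": "my_dataset", "random_seed": 111})
#
#   # data-pipe path: no "class_name", so Dataset.from_data_pipe(**config)
#   # is used as the build function
#   dataset = _build_single_dataset(
#       {"data_pipe": [processor_config], "random_seed": 111})
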
def _get_dataset():
    np.random.seed(546547)
    dataset_size = 200
    inputs_np = {
        'data': np.random.randn(dataset_size, 100).astype(np.float32),
        'labels': np.random.randint(
            10, size=(dataset_size,)).astype(np.int64),
        'temp': np.ones(dataset_size, np.float32)
    }

    def read_data_element():
        data = tf.data.Dataset.from_tensor_slices(inputs_np)
        return data

    dataset = Dataset().build()
    dataset.create_initial_data = MagicMock(side_effect=read_data_element)
    return dataset

def test_call(self):
    file_name = self.tfrecords_file_name
    writer = tf.python_io.TFRecordWriter(file_name)
    for each_sample in self.data_tfrecords:
        write_tf_records(each_sample, file_name, writer, False)
    writer.close()

    output_keys_mapping = {"processor2": {"data1_p": "data2_p"}}
    tf.reset_default_graph()
    reader = TfRecordsDataReaderDummy(
        name="reader",
        file_list_keys_mapping={"data": "tfrecords"}).build()
    processor1 = _DataProcessorTF(
        inbound_nodes=["reader"],
        name="processor1",
        add_num=10.0,
    ).build()
    processor2 = _DataProcessorTF(
        inbound_nodes=["reader"],
        name="processor2",
        incoming_keys_mapping={"reader": {"data2": "data1"}},
        add_num=-20.0,
    ).build()
    data_pipe = DataPipe(readers=[reader],
                         processors=[processor1, processor2]).build()
    file_list = FileList.from_matched_file_names(
        {"tfrecords": [self.tfrecords_file_name]})
    dataset = Dataset.from_data_pipe(
        data_pipe=data_pipe, file_list=file_list,
        output_keys_mapping=output_keys_mapping,
        random_seed=111).build()
    dataset.mode = "train"
    result = dataset(batch_size=4)
    result_iterator = result.make_initializable_iterator()
    batch = result_iterator.get_next()
    shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
    shapes_must = {
        "data_default": [None, 1],
        "data1": [None, 1],
        "data2": [None, None, 20],
        "data3": [None, None, 1],
        "data1_p": [None, 1],
        "data2_p": [None, None, 20]
    }
    self.assertAllEqual(shapes_must, shapes)
    self.assertSetEqual(set(shapes_must), set(dataset.generated_keys_all))
    self.assertEqual(111, dataset.random_seed)
    self.assertIsInstance(dataset, DatasetTfRecords)
    self.evaluate(result_iterator.initializer)
    _ = self.evaluate(batch)

def test_initialize_session(self):
    output_keys_mapping = {"processor1": {"data11": "data11_rm"}}
    processor1 = _DataGenerator1(name="processor1").build()
    processor2 = _DataGenerator2(name="processor2").build()
    processor1.initialize_session = MagicMock(return_value=None)
    processor2.initialize_session = MagicMock(return_value=None)
    data_pipe = DataPipe(processors=[processor1, processor2]).build()
    dataset = Dataset.from_data_pipe(
        data_pipe=data_pipe, random_seed=111,
        output_keys_mapping=output_keys_mapping).build()
    dataset.initialize_session()
    processor1.initialize_session.assert_called_once_with()
    processor2.initialize_session.assert_called_once_with()

def test_call(self):
    output_keys_mapping = {"processor2": {"data1_p": "data2_p"}}
    reader_tf = DataReaderDummyTF(name="reader").build()
    processor1 = _DataProcessorTF(name="processor1",
                                  inbound_nodes=["reader"]).build()
    processor2 = _DataProcessorTF(
        name="processor2",
        inbound_nodes=["reader"],
        incoming_keys_mapping={"reader": {"data2": "data1"}}
    ).build()
    data_pipe = DataPipe(processors=[processor1, processor2],
                         readers=reader_tf).build()
    file_list = FileList.from_matched_file_names(
        self.file_names_with_floats)
    dataset = Dataset.from_data_pipe(
        data_pipe=data_pipe, file_list=file_list,
        output_keys_mapping=output_keys_mapping,
        random_seed=111).build()
    dataset.mode = "train"
    result = dataset(batch_size=4)
    result_iterator = result.make_initializable_iterator()
    batch = result_iterator.get_next()
    shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
    shapes_must = {
        "data1": [None],
        "data2": [None],
        "data1_p": [None],
        "data2_p": [None]
    }
    self.assertAllEqual(shapes_must, shapes)
    self.assertSetEqual(set(shapes_must), set(dataset.generated_keys_all))
    self.assertEqual(111, dataset.random_seed)
    self.assertIsInstance(dataset, DatasetFileList)
    self.evaluate(result_iterator.initializer)
    _ = self.evaluate(batch)

def test_create_batch(self, prefetch_buffer_size):
    batch_size = 3
    data = tf.data.Dataset.from_tensor_slices(self.inputs)
    data_batch = data.batch(batch_size)

    def _create_features_for_single_sample():
        return data

    def _combine_samples_to_batch(data, batch_size):
        return data_batch

    def _prefetch(data):
        return data

    # mock out the three stages used by create_batch and attach them to one
    # parent mock so their relative call order can be asserted
    dataset = Dataset(prefetch_buffer_size=prefetch_buffer_size).build()
    dataset.create_features_for_single_sample = MagicMock(
        side_effect=_create_features_for_single_sample)
    dataset.combine_samples_to_batch = MagicMock(
        side_effect=_combine_samples_to_batch)
    dataset.prefetch = MagicMock(side_effect=_prefetch)
    dataset.mode = 'train'

    create_batch_calls_mock = Mock()
    create_batch_calls_mock.attach_mock(
        dataset.create_features_for_single_sample,
        'create_features_for_single_sample_call')
    create_batch_calls_mock.attach_mock(
        dataset.combine_samples_to_batch,
        'combine_samples_to_batch_call')
    create_batch_calls_mock.attach_mock(
        dataset.prefetch, 'prefetch_call')

    data_batch = dataset.create_batch(batch_size)

    # per-sample features must be created first, then batched; prefetch is
    # expected only when a prefetch buffer is configured
    create_batch_expected_calls = [
        mock_call.create_features_for_single_sample_call(),
        mock_call.combine_samples_to_batch_call(data, batch_size)
    ]
    if prefetch_buffer_size > 0:
        create_batch_expected_calls.append(
            mock_call.prefetch_call(data_batch))
    create_batch_calls_mock.assert_has_calls(create_batch_expected_calls)

def test_cache_and_clear_cache(self):
    cache_dir = os.path.join(self.get_temp_dir(), 'cache')
    cache_file_name_pref_must = "-".join(['Dataset', 'stype', 'train'])
    data = tf.data.Dataset.from_tensor_slices(self.inputs)
    dataset = Dataset(cache_dir=cache_dir, subtype='stype').build()
    dataset.mode = 'train'
    data_cached = dataset.cache(data)
    outputs = _get_data_results(data_cached, self.test_session())
    cache_file_name = dataset._cache_file
    self.assertTrue(os.path.split(cache_file_name)[-1].startswith(
        cache_file_name_pref_must))
    self.assertDictEqual(self.inputs, outputs)
    self.assertTrue(os.path.isfile(cache_file_name + '.index'))
    dataset.clear_cache()
    self.assertFalse(os.path.isfile(cache_file_name + '.index'))

def test_create_features_for_single_sample(self, mode, number_of_shards=1,
                                           with_cache=False):
    # each mocked stage tags the data with a suffix so that the set and
    # order of applied stages can be read off the final output
    def _create_initial_data():
        return self.inputs

    def _extract_features_from_initial_data(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_features')

    def _filter_sample(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_filtered')

    def _shard(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_sharded')

    def _cache(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_cached')

    def _shuffle(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_shuffled')

    def _repeat(inputs):
        return _add_suffix_to_each_value_in_dict_of_lists(
            inputs, '_repeated')

    if with_cache:
        cache_dir = os.path.join(self.get_temp_dir(), 'cache')
    else:
        cache_dir = None
    dataset = Dataset(number_of_shards=number_of_shards,
                      cache_dir=cache_dir).build()
    dataset.mode = mode
    dataset.create_initial_data = MagicMock(
        side_effect=_create_initial_data)
    dataset.shard = MagicMock(side_effect=_shard)
    dataset.extract_features_from_initial_data = MagicMock(
        side_effect=_extract_features_from_initial_data)
    dataset.cache = MagicMock(side_effect=_cache)
    dataset.shuffle = MagicMock(side_effect=_shuffle)
    dataset.repeat = MagicMock(side_effect=_repeat)
    dataset.add_data_filter(_DummyDataFilterOddData())
    dataset.filter_sample = MagicMock(side_effect=_filter_sample)

    outputs = dataset.create_features_for_single_sample()

    # expected suffixes depend on mode, sharding and caching
    outputs_must = {k: v for k, v in self.inputs.items()}
    if number_of_shards > 1:
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_sharded')
    outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
        outputs_must, '_features')
    if with_cache:
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_cached')
    if mode == 'train':
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_shuffled')
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_repeated')
    outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
        outputs_must, '_filtered')
    self.assertDictEqual(outputs_must, outputs)

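# NOTE: _add_suffix_to_each_value_in_dict_of_lists is defined elsewhere in this
# test module. The sketch below is an assumed reconstruction based on its usage
# in test_create_features_for_single_sample: it appends the given suffix to
# every value of every list in the dict, so the test can track which mocked
# pipeline stages were applied and in which order.
def _add_suffix_to_each_value_in_dict_of_lists(inputs, suffix):
    return {each_key: [str(each_value) + suffix for each_value in each_list]
            for each_key, each_list in inputs.items()}
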
def test_build_data_object_from_config(self, with_file_list, with_data_filter,
                                       with_data_pipe=False,
                                       with_file_list_mapping=False):
    config_object = {"random_seed": 65477}
    if with_data_pipe:
        built_fn = lambda x: Dataset.from_data_pipe(**x).build()
        dataset_base_cls = Dataset
    else:
        if with_file_list:
            register_new_class("dummy_dataset", _DummyDatasetFileList)
            dataset_base_cls = _DummyDatasetFileList
            built_fn = None
        else:
            register_new_class("dummy_dataset", _DummyDataset)
            dataset_base_cls = _DummyDataset
            built_fn = None
        config_object.update({"class_name": "dummy_dataset", "p1": 100})

    file_list_keys_mapping = {"key1": "key1_r"}
    if with_file_list:
        config_object["file_list"] = {
            "class_name": "file_list1",
            "file_names": {
                "key1": ["value1"]
            },
            "name": "file_list_name"
        }
        _DummyDatasetFileList.file_list_keys = ["key1"]
        if with_file_list_mapping:
            config_object["file_list_keys_mapping"] = {"key1": "key1_r"}
            _DummyDatasetFileList.file_list_keys = ["key1_r"]
    if with_data_filter:
        config_object["data_filter"] = {
            "class_name": "data_filter1",
            "dp1": 1
        }
    if with_data_pipe:
        reader_config1 = {"class_name": "reader1", "name": "reader1_name"}
        reader_config2 = {"class_name": "reader2", "name": "reader2_name"}
        processor_config1 = {
            "class_name": "processor1",
            "name": "processor1_name"
        }
        processor_config2 = {
            "class_name": "processor2",
            "name": "processor2_name"
        }
        if with_file_list:
            config_object["data_pipe"] = [
                processor_config1, reader_config1,
                reader_config2, processor_config2
            ]
        else:
            config_object["data_pipe"] = [
                processor_config1, processor_config2
            ]

    if with_data_pipe and with_file_list_mapping:
        with self.assertRaises(ValueError):
            data_builder_lib.build_data_object_from_config(
                config_object, base_cls=dataset_base_cls,
                built_fn=built_fn)
        return

    built_object = data_builder_lib.build_data_object_from_config(
        config_object, base_cls=dataset_base_cls, built_fn=built_fn)
    self.assertTrue(built_object.built)
    self.assertEqual(65477, built_object.random_seed)
    self.assertIsInstance(built_object, dataset_base_cls)

    if with_data_filter:
        self.assertEqual(1, len(built_object.data_filters))
        data_filter = built_object.data_filters[0]
        self.assertTrue(data_filter.built)
        self.assertIsInstance(data_filter, _DummyDataFilter)
        self.assertEqual(1, data_filter.dp1)
    else:
        self.assertIsNone(built_object.data_filters)

    if with_file_list:
        file_list = built_object.file_list
        self.assertIsInstance(file_list, FileListDummy)
        self.assertTrue(file_list.built)
        self.assertEqual("file_list_name", file_list.name)
        file_names_must = ({
            "key1_r": ["value1"]
        } if with_file_list_mapping else {
            "key1": ["value1"]
        })
        self.assertDictEqual(file_names_must, file_list.get())
        if with_file_list_mapping:
            self.assertDictEqual(file_list_keys_mapping,
                                 built_object.file_list_keys_mapping)
        else:
            self.assertIsNone(built_object.file_list_keys_mapping)
    else:
        self.assertFalse(hasattr(built_object, "file_list"))
        self.assertFalse(hasattr(built_object, "file_list_keys_mapping"))

    if with_data_pipe:
        data_pipe = built_object.data_pipe
        self.assertIsInstance(data_pipe, DataPipe)
        self.assertTrue(data_pipe.built)
        if with_file_list:
            self.assertSetEqual(
                {"reader1_name", "reader2_name"},
                {r.name for r in data_pipe.readers.values()})
        else:
            self.assertDictEqual({}, data_pipe.readers)
        self.assertSetEqual(
            {"processor1_name", "processor2_name"},
            {r.name for r in data_pipe.processors.values()})