Example 1
    def test_combine_samples_to_batch(self, padded,
                                      fix_batch_dimension=True):
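        """Batched shapes must have the batch dimension fixed to
        batch_size when fix_batch_dimension is set and None otherwise;
        padded (None) dimensions must stay None."""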
        batch_size = 3
        if padded:
            inputs = {'input1': tf.placeholder(tf.float32, [10, None]),
                      'input2': tf.placeholder(tf.float32, [10, 2, None])}
        else:
            inputs = {'input1': tf.placeholder(tf.float32, [10, 5]),
                      'input2': tf.placeholder(tf.float32, [10, 2, 3])}
        dataset = Dataset(fix_batch_dimension=fix_batch_dimension).build()
        data = tf.data.Dataset.from_tensor_slices(inputs)
        data_batch = dataset.combine_samples_to_batch(data, batch_size)

        batch_size_in_shape = batch_size if fix_batch_dimension else None
        if padded:
            output_shapes_must = {
                'input1': [batch_size_in_shape, None],
                'input2': [batch_size_in_shape, 2, None]
            }
        else:
            output_shapes_must = {
                'input1': [batch_size_in_shape, 5],
                'input2': [batch_size_in_shape, 2, 3]
            }

        output_shapes_as_list = {
            k: each_shape.as_list()
            for k, each_shape in data_batch.output_shapes.items()}

        self.assertDictEqual(output_shapes_must,
                             output_shapes_as_list)
Example 2
    def test_repeat(self):
        """Repeated data must cycle through the inputs; here the first
        5 * number_of_samples elements must equal the inputs repeated
        five times."""
        data = tf.data.Dataset.from_tensor_slices(self.inputs)
        dataset = Dataset().build()
        data_repeat = dataset.repeat(data)
        outputs = _get_data_results(data_repeat, self.test_session(),
                                    max_iteration=self.number_of_samples * 5)
        outputs_repeated_must = {k: v * 5 for k, v in self.inputs.items()}
        self.assertDictEqual(outputs_repeated_must,
                             outputs)
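
Several of these tests collect dataset contents through _get_data_results, which is not shown in the snippets. A minimal sketch of what such a helper might look like, assuming the TF1 session API and dict-structured datasets (the name and signature are taken from the calls above; the body is an assumption):

import tensorflow as tf  # TF1.x API assumed


def _get_data_results(data, session, max_iteration=None):
    # Sketch only: drain a tf.data.Dataset inside a TF1 session and
    # collect the evaluated values per key into python lists.
    iterator = data.make_initializable_iterator()
    next_element = iterator.get_next()
    session.run(iterator.initializer)
    results = {}
    iteration = 0
    while max_iteration is None or iteration < max_iteration:
        try:
            sample = session.run(next_element)
        except tf.errors.OutOfRangeError:
            break
        for key, value in sample.items():
            results.setdefault(key, []).append(value)
        iteration += 1
    return results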
Example 3
    def test_constructor(self):
        """The constructor must raise on invalid shard arguments,
        negative buffer sizes and a nonexistent cache directory."""
        with self.assertRaises(ValueError):
            Dataset(number_of_shards=0, shard_index=0)
        with self.assertRaises(ValueError):
            Dataset(number_of_shards=2, shard_index=2)
        with self.assertRaises(ValueError):
            Dataset(shuffle_buffer_size=-1)
        with self.assertRaises(ValueError):
            Dataset(prefetch_buffer_size=-1)
        temp_dir = self.get_temp_dir()
        with self.assertRaises(FileNotFoundError):
            Dataset(cache_dir=os.path.join(temp_dir, 'cache1', 'cache2'))
Example 4
    def test_shard(self, number_of_shards=4, shard_index=2):
        """Sharding must select every number_of_shards-th sample
        starting at shard_index; with a single shard it is a no-op."""
        data = tf.data.Dataset.from_tensor_slices(self.inputs)
        dataset = Dataset(number_of_shards=number_of_shards,
                          shard_index=shard_index).build()
        data_sharded = dataset.shard(data)
        outputs = _get_data_results(data_sharded, self.test_session())
        if number_of_shards == 1:
            outputs_must = self.inputs
        else:
            outputs_must = {k: v[shard_index::number_of_shards]
                            for k, v in self.inputs.items()}
        self.assertDictEqual(outputs_must, outputs)
Example 5
    def test_shuffle(self, shuffle_buffer_size):
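        """With shuffle_buffer_size == 1 shuffling is a no-op; with a
        larger buffer the order must change while the set of samples
        stays the same."""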
        data = tf.data.Dataset.from_tensor_slices(self.inputs)
        dataset = Dataset(shuffle_buffer_size=shuffle_buffer_size).build()
        data_shuffled = dataset.shuffle(data)
        outputs = _get_data_results(data_shuffled, self.test_session())
        if shuffle_buffer_size == 1:
            self.assertDictEqual(self.inputs,
                                 outputs)
            return

        for each_key, each_list_value in self.inputs.items():
            self.assertNotEqual(each_list_value,
                                outputs[each_key])
            self.assertSetEqual(set(each_list_value),
                                set(outputs[each_key]))
Example 6
    def test_call(self):
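        """Dataset.from_data_pipe must rename the generated keys
        according to output_keys_mapping and expose the batch shapes,
        generated keys and random seed."""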
        output_keys_mapping = {"processor1": {"data11": "data11_rm"}}
        processor1 = _DataGenerator1(name="processor1").build()
        processor2 = _DataGenerator2(name="processor2").build()
        data_pipe = DataPipe(processors=[processor1, processor2]).build()
        dataset = Dataset.from_data_pipe(
            data_pipe=data_pipe, random_seed=111,
            output_keys_mapping=output_keys_mapping).build()
        dataset.mode = "train"
        result = dataset(batch_size=4)
        result_iterator = result.make_initializable_iterator()
        batch = result_iterator.get_next()
        shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
        shapes_must = {
            "data11_rm": [None, 10, 5],
            "data12": [None, 1],
            "data21": [None, 1, 3],
            "data22": [None, 5]
        }
        self.assertAllEqual(shapes_must,
                            shapes)
        self.assertSetEqual(set(shapes_must),
                            set(dataset.generated_keys_all))

        self.assertEqual(111,
                         dataset.random_seed)
        self.assertIsInstance(dataset,
                              Dataset)
        self.evaluate(result_iterator.initializer)
        _ = self.evaluate(batch)
Example 7
def _build_single_dataset(dataset_config) -> Dataset:
    """
    Build single dataset based on its config

    Parameters
    ----------
    dataset_config
        dataset config

    Returns
    -------
    dataset
        dataset
    """
    deprecated.replace_deprecated_parameter_in_config(
        "name", "subtype", dataset_config, required=False)

    if ("data_pipe" in dataset_config
            and "class_name" not in dataset_config):
        build_fn = lambda x: Dataset.from_data_pipe(**x).build()
    else:
        build_fn = None
    dataset = data_builder_lib.build_data_object_from_config(
        config=dataset_config, base_cls=Dataset,
        built_fn=build_fn)
    return dataset
Example 8
    def _get_dataset():
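        """Build a Dataset whose create_initial_data is mocked to
        return a fixed in-memory dataset of 200 random samples."""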
        np.random.seed(546547)
        dataset_size = 200
        inputs_np = {
            'data': np.random.randn(dataset_size, 100).astype(np.float32),
            'labels': np.random.randint(
                10, size=(dataset_size,)).astype(np.int64),
            'temp': np.ones(dataset_size, np.float32)
        }

        def read_data_element():
            data = tf.data.Dataset.from_tensor_slices(inputs_np)
            return data

        dataset = Dataset().build()
        dataset.create_initial_data = MagicMock(side_effect=read_data_element)
        return dataset
Example 9
    def test_call(self):
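        """End to end: write tfrecords, read them back through a
        DataPipe with two processors, and check the batch shapes, the
        key renaming and the resulting DatasetTfRecords instance."""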
        file_name = self.tfrecords_file_name
        writer = tf.python_io.TFRecordWriter(file_name)
        for each_sample in self.data_tfrecords:
            write_tf_records(each_sample, file_name, writer, False)
        writer.close()

        output_keys_mapping = {"processor2": {"data1_p": "data2_p"}}
        tf.reset_default_graph()
        reader = TfRecordsDataReaderDummy(
            name="reader",
            file_list_keys_mapping={"data": "tfrecords"}).build()
        processor1 = _DataProcessorTF(
            inbound_nodes=["reader"], name="processor1",
            add_num=10.0,
        ).build()
        processor2 = _DataProcessorTF(
            inbound_nodes=["reader"], name="processor2",
            incoming_keys_mapping={"reader": {"data2": "data1"}},
            add_num=-20.0,
        ).build()
        data_pipe = DataPipe(readers=[reader],
                             processors=[processor1, processor2]).build()

        file_list = FileList.from_matched_file_names(
            {"tfrecords": [self.tfrecords_file_name]})
        dataset = Dataset.from_data_pipe(
            data_pipe=data_pipe, file_list=file_list,
            output_keys_mapping=output_keys_mapping,
            random_seed=111).build()
        dataset.mode = "train"
        result = dataset(batch_size=4)
        result_iterator = result.make_initializable_iterator()
        batch = result_iterator.get_next()

        shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
        shapes_must = {
            "data_default": [None, 1],
            "data1": [None, 1],
            "data2": [None, None, 20],
            "data3": [None, None, 1],
            "data1_p": [None, 1],
            "data2_p": [None, None, 20]
        }
        self.assertAllEqual(shapes_must,
                            shapes)
        self.assertSetEqual(set(shapes_must),
                            set(dataset.generated_keys_all))

        self.assertEqual(111,
                         dataset.random_seed)
        self.assertIsInstance(dataset,
                              DatasetTfRecords)
        self.evaluate(result_iterator.initializer)
        _ = self.evaluate(batch)
Example 10
    def test_initialize_session(self):
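        """initialize_session must be forwarded once to every
        processor of the data pipe."""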
        output_keys_mapping = {"processor1": {"data11": "data11_rm"}}
        processor1 = _DataGenerator1(name="processor1").build()
        processor2 = _DataGenerator2(name="processor2").build()
        processor1.initialize_session = MagicMock(return_value=None)
        processor2.initialize_session = MagicMock(return_value=None)
        data_pipe = DataPipe(processors=[processor1, processor2]).build()
        dataset = Dataset.from_data_pipe(
            data_pipe=data_pipe, random_seed=111,
            output_keys_mapping=output_keys_mapping).build()
        dataset.initialize_session()

        processor1.initialize_session.assert_called_once_with()
        processor2.initialize_session.assert_called_once_with()
Example 11
    def test_call(self):
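        """A data pipe with a file list reader must produce a
        DatasetFileList with renamed output keys."""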
        output_keys_mapping = {"processor2": {"data1_p": "data2_p"}}
        reader_tf = DataReaderDummyTF(name="reader").build()
        processor1 = _DataProcessorTF(name="processor1",
                                      inbound_nodes=["reader"]).build()
        processor2 = _DataProcessorTF(
            name="processor2",
            inbound_nodes=["reader"],
            incoming_keys_mapping={"reader": {"data2": "data1"}}
        ).build()
        # readers wrapped in a list, matching the other DataPipe usages
        data_pipe = DataPipe(processors=[processor1, processor2],
                             readers=[reader_tf]).build()
        file_list = FileList.from_matched_file_names(
            self.file_names_with_floats)
        dataset = Dataset.from_data_pipe(
            data_pipe=data_pipe, file_list=file_list,
            output_keys_mapping=output_keys_mapping,
            random_seed=111).build()
        dataset.mode = "train"
        result = dataset(batch_size=4)
        result_iterator = result.make_initializable_iterator()
        batch = result_iterator.get_next()

        shapes = {k: v.as_list() for k, v in result.output_shapes.items()}
        shapes_must = {
            "data1": [None],
            "data2": [None],
            "data1_p": [None],
            "data2_p": [None]
        }
        self.assertAllEqual(shapes_must,
                            shapes)
        self.assertSetEqual(set(shapes_must),
                            set(dataset.generated_keys_all))

        self.assertEqual(111,
                         dataset.random_seed)
        self.assertIsInstance(dataset,
                              DatasetFileList)
        self.evaluate(result_iterator.initializer)
        _ = self.evaluate(batch)
Example 12
    def test_create_batch(self, prefetch_buffer_size):
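        """create_batch must call create_features_for_single_sample,
        then combine_samples_to_batch, and prefetch only when
        prefetch_buffer_size > 0; mock_call below is presumably
        unittest.mock.call."""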
        batch_size = 3
        data = tf.data.Dataset.from_tensor_slices(self.inputs)
        data_batch = data.batch(batch_size)

        def _create_features_for_single_sample():
            return data

        def _combine_samples_to_batch(data, batch_size):
            return data_batch

        def _prefetch(data):
            return data

        dataset = Dataset(prefetch_buffer_size=prefetch_buffer_size).build()
        dataset.create_features_for_single_sample = MagicMock(
            side_effect=_create_features_for_single_sample)
        dataset.combine_samples_to_batch = MagicMock(
            side_effect=_combine_samples_to_batch)
        dataset.prefetch = MagicMock(side_effect=_prefetch)
        dataset.mode = 'train'

        create_batch_calls_mock = Mock()
        create_batch_calls_mock.attach_mock(
            dataset.create_features_for_single_sample,
            'create_features_for_single_sample_call')
        create_batch_calls_mock.attach_mock(
            dataset.combine_samples_to_batch,
            'combine_samples_to_batch_call')
        create_batch_calls_mock.attach_mock(
            dataset.prefetch,
            'prefetch_call')

        data_batch = dataset.create_batch(batch_size)

        create_batch_expected_calls = [
            mock_call.create_features_for_single_sample_call(),
            mock_call.combine_samples_to_batch_call(data, batch_size)
        ]
        if prefetch_buffer_size > 0:
            create_batch_expected_calls.append(
                mock_call.prefetch_call(data_batch))

        create_batch_calls_mock.assert_has_calls(create_batch_expected_calls)
Example 13
    def test_cache_and_clear_cache(self):
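        """Caching must write a cache file whose name starts with
        Dataset-stype-train inside cache_dir; clear_cache must remove
        it again."""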
        cache_dir = os.path.join(self.get_temp_dir(), 'cache')
        cache_file_name_pref_must = "-".join(['Dataset', 'stype', 'train'])

        data = tf.data.Dataset.from_tensor_slices(self.inputs)
        dataset = Dataset(cache_dir=cache_dir, subtype='stype').build()
        dataset.mode = 'train'
        data_cached = dataset.cache(data)
        outputs = _get_data_results(data_cached, self.test_session())
        cache_file_name = dataset._cache_file

        self.assertTrue(os.path.split(cache_file_name)[-1].startswith(
            cache_file_name_pref_must))
        self.assertDictEqual(self.inputs,
                             outputs)
        self.assertTrue(os.path.isfile(cache_file_name + '.index'))

        dataset.clear_cache()
        self.assertFalse(os.path.isfile(cache_file_name + '.index'))
Example 14
    def test_create_features_for_single_sample(self, mode, number_of_shards=1,
                                               with_cache=False):
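        """The single-sample pipeline must apply, in this order: shard
        (only if number_of_shards > 1), extract features, cache (only
        with a cache_dir), shuffle (train mode only), repeat and
        filter."""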
        def _create_initial_data():
            return self.inputs

        def _extract_features_from_initial_data(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_features')

        def _filter_sample(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_filtered')

        def _shard(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_sharded')

        def _cache(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_cached')

        def _shuffle(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_shuffled')

        def _repeat(inputs):
            return _add_suffix_to_each_value_in_dict_of_lists(
                inputs, '_repeated')

        if with_cache:
            cache_dir = os.path.join(self.get_temp_dir(), 'cache')
        else:
            cache_dir = None
        dataset = Dataset(number_of_shards=number_of_shards,
                          cache_dir=cache_dir).build()
        dataset.mode = mode
        dataset.create_initial_data = MagicMock(
            side_effect=_create_initial_data)
        dataset.shard = MagicMock(
            side_effect=_shard)
        dataset.extract_features_from_initial_data = MagicMock(
            side_effect=_extract_features_from_initial_data)
        dataset.cache = MagicMock(side_effect=_cache)
        dataset.shuffle = MagicMock(side_effect=_shuffle)
        dataset.repeat = MagicMock(side_effect=_repeat)
        dataset.add_data_filter(_DummyDataFilterOddData())
        dataset.filter_sample = MagicMock(side_effect=_filter_sample)

        outputs = dataset.create_features_for_single_sample()

        outputs_must = dict(self.inputs)
        if number_of_shards > 1:
            outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
                outputs_must, '_sharded')
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_features')
        if with_cache:
            outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
                outputs_must, '_cached')
        if mode == 'train':
            outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
                outputs_must, '_shuffled')
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_repeated')
        outputs_must = _add_suffix_to_each_value_in_dict_of_lists(
            outputs_must, '_filtered')
        self.assertDictEqual(outputs_must,
                             outputs)
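
Example 14 relies on _add_suffix_to_each_value_in_dict_of_lists, which is also not part of the snippet. A plausible sketch, assuming string values in the dict of lists (the name and signature come from the calls above; the body is an assumption):

def _add_suffix_to_each_value_in_dict_of_lists(inputs, suffix):
    # Sketch only: tag every value with a stage suffix so the test can
    # track which pipeline steps (shard, cache, shuffle, ...) ran.
    return {key: [value + suffix for value in values]
            for key, values in inputs.items()}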
Example 15
    def test_build_data_object_from_config(self,
                                           with_file_list,
                                           with_data_filter,
                                           with_data_pipe=False,
                                           with_file_list_mapping=False):
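        """Datasets must be buildable from a config with optional file
        list, data filter and data pipe; combining a data pipe with a
        file_list_keys_mapping must raise a ValueError."""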
        config_object = {"random_seed": 65477}
        if with_data_pipe:
            built_fn = lambda x: Dataset.from_data_pipe(**x).build()
            dataset_base_cls = Dataset
        else:
            if with_file_list:
                register_new_class("dummy_dataset", _DummyDatasetFileList)
                dataset_base_cls = _DummyDatasetFileList
                built_fn = None
            else:
                register_new_class("dummy_dataset", _DummyDataset)
                dataset_base_cls = _DummyDataset
                built_fn = None

            config_object.update({"class_name": "dummy_dataset", "p1": 100})
        file_list_keys_mapping = {"key1": "key1_r"}
        if with_file_list:
            config_object["file_list"] = {
                "class_name": "file_list1",
                "file_names": {
                    "key1": ["value1"]
                },
                "name": "file_list_name"
            }
            _DummyDatasetFileList.file_list_keys = ["key1"]
            if with_file_list_mapping:
                config_object["file_list_keys_mapping"] = {"key1": "key1_r"}
                _DummyDatasetFileList.file_list_keys = ["key1_r"]
        if with_data_filter:
            config_object["data_filter"] = {
                "class_name": "data_filter1",
                "dp1": 1
            }

        if with_data_pipe:
            reader_config1 = {"class_name": "reader1", "name": "reader1_name"}
            reader_config2 = {"class_name": "reader2", "name": "reader2_name"}
            processor_config1 = {
                "class_name": "processor1",
                "name": "processor1_name"
            }
            processor_config2 = {
                "class_name": "processor2",
                "name": "processor2_name"
            }
            if with_file_list:
                config_object["data_pipe"] = [
                    processor_config1, reader_config1, reader_config2,
                    processor_config2
                ]
            else:
                config_object["data_pipe"] = [
                    processor_config1, processor_config2
                ]
        if with_data_pipe and with_file_list_mapping:
            with self.assertRaises(ValueError):
                data_builder_lib.build_data_object_from_config(
                    config_object,
                    base_cls=dataset_base_cls,
                    built_fn=built_fn)
            return

        built_object = data_builder_lib.build_data_object_from_config(
            config_object, base_cls=dataset_base_cls, built_fn=built_fn)
        self.assertTrue(built_object.built)
        self.assertEqual(65477, built_object.random_seed)

        self.assertIsInstance(built_object, dataset_base_cls)
        if with_data_filter:
            self.assertEqual(1, len(built_object.data_filters))
            data_filter = built_object.data_filters[0]
            self.assertTrue(data_filter.built)
            self.assertIsInstance(data_filter, _DummyDataFilter)
            self.assertEqual(1, data_filter.dp1)
        else:
            self.assertIsNone(built_object.data_filters)

        if with_file_list:
            file_list = built_object.file_list
            self.assertIsInstance(file_list, FileListDummy)
            self.assertTrue(file_list.built)
            self.assertEqual("file_list_name", file_list.name)
            if with_file_list_mapping:
                file_names_must = {"key1_r": ["value1"]}
            else:
                file_names_must = {"key1": ["value1"]}
            self.assertDictEqual(file_names_must, file_list.get())
            if with_file_list_mapping:
                self.assertDictEqual(file_list_keys_mapping,
                                     built_object.file_list_keys_mapping)
            else:
                self.assertIsNone(built_object.file_list_keys_mapping)
        else:
            self.assertFalse(hasattr(built_object, "file_list"))
            self.assertFalse(hasattr(built_object, "file_list_keys_mapping"))

        if with_data_pipe:
            data_pipe = built_object.data_pipe
            self.assertIsInstance(data_pipe, DataPipe)
            self.assertTrue(data_pipe.built)
            if with_file_list:
                self.assertSetEqual(
                    {"reader1_name", "reader2_name"},
                    {r.name for r in data_pipe.readers.values()})
            else:
                self.assertDictEqual({}, data_pipe.readers)
            self.assertSetEqual(
                {"processor1_name", "processor2_name"},
                {p.name for p in data_pipe.processors.values()})