def test_multi_split(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds_train, ds_test = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST],
        as_dataset_kwargs=dict(shuffle_files=False))

    data = list(dataset_utils.dataset_as_numpy(ds_train))
    self.assertEqual(20, len(data))

    data = list(dataset_utils.dataset_as_numpy(ds_test))
    self.assertEqual(10, len(data))
def test_with_configs(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder1 = DummyDatasetWithConfigs(config="plus1", data_dir=tmp_dir)
    builder2 = DummyDatasetWithConfigs(config="plus2", data_dir=tmp_dir)
    # Test that builder.builder_config is the correct config
    self.assertIs(builder1.builder_config,
                  DummyDatasetWithConfigs.builder_configs["plus1"])
    self.assertIs(builder2.builder_config,
                  DummyDatasetWithConfigs.builder_configs["plus2"])
    builder1.download_and_prepare()
    builder2.download_and_prepare()
    data_dir1 = os.path.join(tmp_dir, builder1.name, "plus1", "0.0.1")
    data_dir2 = os.path.join(tmp_dir, builder2.name, "plus2", "0.0.2")
    # Test that subdirectories were created per config
    self.assertTrue(tf.gfile.Exists(data_dir1))
    self.assertTrue(tf.gfile.Exists(data_dir2))
    # 2 train shards, 1 test shard, plus metadata files
    self.assertGreater(len(tf.gfile.ListDirectory(data_dir1)), 3)
    self.assertGreater(len(tf.gfile.ListDirectory(data_dir2)), 3)

    # Test that the config was used and they didn't collide.
    splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
    for builder, incr in [(builder1, 1), (builder2, 2)]:
      train_data, test_data = [
          [el["x"] for el in dataset_utils.dataset_as_numpy(
              builder.as_dataset(split=split))]
          for split in splits_list
      ]

      self.assertEqual(20, len(train_data))
      self.assertEqual(10, len(test_data))
      self.assertEqual([incr + el for el in range(30)],
                       sorted(train_data + test_data))
def features_encode_decode(features_dict, example, as_tensor=False):
  """Runs the full pipeline: encode > write > tmp files > read > decode."""
  # Encode example
  encoded_example = features_dict.encode_example(example)

  with tmp_dir() as tmp_dir_:
    tmp_filename = os.path.join(tmp_dir_, "tmp.tfrecord")

    # Read/write the file
    file_adapter = file_format_adapter.TFRecordExampleAdapter(
        features_dict.get_serialized_info())
    file_adapter.write_from_generator(
        generator_fn=lambda: [encoded_example],
        output_files=[tmp_filename],
    )
    dataset = file_adapter.dataset_from_filename(tmp_filename)

    # Decode the example
    dataset = dataset.map(features_dict.decode_example)

    if not as_tensor:  # Evaluate to numpy array
      for el in dataset_utils.dataset_as_numpy(dataset):
        return el
    else:
      if tf.executing_eagerly():
        return next(iter(dataset))
      else:
        return tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
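def _example_features_roundtrip():
  # Illustrative sketch (hypothetical, not part of the original suite):
  # round-trips a scalar int64 feature through the encode > write > read >
  # decode pipeline above. Assumes `features_lib` is imported and exposes
  # FeaturesDict and Tensor the way `tfds.features` does.
  fdict = features_lib.FeaturesDict({
      "x": features_lib.Tensor(shape=(), dtype=tf.int64),
  })
  decoded = features_encode_decode(fdict, {"x": 42})
  assert decoded["x"] == 42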
def test_determinism(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        split=splits_lib.Split.TRAIN,
        as_dataset_kwargs=dict(shuffle_files=False))
    ds_values = list(dataset_utils.dataset_as_numpy(ds))

    # Ensure determinism. If this test fails, it means that the numpy random
    # module isn't always deterministic (e.g. across versions or
    # architectures), so our datasets aren't guaranteed to be either.
    l = list(range(20))
    np.random.RandomState(42).shuffle(l)
    self.assertEqual(l, [
        0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10, 14, 6
    ])

    # Ensure determinism. If this test fails, it means the dataset is not
    # deterministically generated.
    self.assertEqual(
        [e["x"] for e in ds_values],
        [24, 1, 3, 4, 15, 25, 0, 16, 21, 10, 6, 13, 27, 22, 12, 28, 9, 19,
         18, 7],
    )
def test_shared_generator(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)
    builder.download_and_prepare()

    written_filepaths = [
        os.path.join(builder._data_dir, fname)
        for fname in tf.io.gfile.listdir(builder._data_dir)
    ]
    # The data_dir contains the cached directory by default
    expected_filepaths = builder._build_split_filenames(
        split_info_list=builder.info.splits.values())
    expected_filepaths.append(
        os.path.join(builder._data_dir, "dataset_info.json"))
    self.assertEqual(sorted(expected_filepaths), sorted(written_filepaths))

    splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
    train_data, test_data = [[
        el["x"]
        for el in dataset_utils.dataset_as_numpy(
            builder.as_dataset(split=split))
    ] for split in splits_list]

    self.assertEqual(20, len(train_data))
    self.assertEqual(10, len(test_data))
    self.assertEqual(list(range(30)), sorted(train_data + test_data))

    # Builder's info should also have the above information.
    self.assertTrue(builder.info.initialized)
    self.assertEqual(
        20, builder.info.splits[splits_lib.Split.TRAIN].num_examples)
    self.assertEqual(
        10, builder.info.splits[splits_lib.Split.TEST].num_examples)
    self.assertEqual(30, builder.info.splits.total_num_examples)
def test_nested_dataset_simultaneous_access(self):
  ds1 = _create_dataset(range(10))
  ds2 = _create_dataset(range(10, 20))
  np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
  np_ds1 = np_ds[0]
  np_ds2 = np_ds[1]["a"]

  for i1, i2 in zip(np_ds1, np_ds2):
    self.assertEqual(i2, int(i1) + 10)
def test_nested_dataset_sequential_access(self):
  ds1 = _create_dataset(range(10))
  ds2 = _create_dataset(range(10, 20))
  np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
  np_ds1 = np_ds[0]
  np_ds2 = np_ds[1]["a"]

  self.assertEqual(list(range(10)), [int(el) for el in list(np_ds1)])
  self.assertEqual(list(range(10, 20)), [int(el) for el in list(np_ds2)])
def test_singleton_dataset_with_nested_elements(self):
  ds = _create_dataset(range(10))
  ds = ds.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)})
  np_ds = dataset_utils.dataset_as_numpy(ds)

  for i, el in enumerate(np_ds):
    self.assertEqual(i, el["a"])
    self.assertEqual(i + 1, el["b"])
    self.assertEqual(i + 2, el["c"][0])
    self.assertEqual(i + 3, el["c"][1])
def test_load(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    dataset = registered.load(
        name="dummy_dataset_shared_generator",
        data_dir=tmp_dir,
        download=True,
        split=splits_lib.Split.TRAIN)
    data = list(dataset_utils.dataset_as_numpy(dataset))
    self.assertEqual(20, len(data))
    self.assertLess(data[0]["x"], 30)
def test_nested_tensors(self):
  t1 = tf.random.normal((10, 10))
  t2 = tf.random.normal((10, 20))
  nest_tup = (t1, t2)
  np_t1, np_t2 = dataset_utils.dataset_as_numpy(nest_tup)
  self.assertEqual((10, 10), np_t1.shape)
  self.assertEqual(np.float32, np_t1.dtype)
  self.assertEqual((10, 20), np_t2.shape)
  self.assertEqual(np.float32, np_t2.dtype)

  nest_dict = {"foo": t1, "bar": {"zoo": t2}}
  np_nest_dict = dataset_utils.dataset_as_numpy(nest_dict)
  np_t1 = np_nest_dict["foo"]
  np_t2 = np_nest_dict["bar"]["zoo"]
  self.assertEqual((10, 10), np_t1.shape)
  self.assertEqual(np.float32, np_t1.dtype)
  self.assertEqual((10, 20), np_t2.shape)
  self.assertEqual(np.float32, np_t2.dtype)
def test_with_batch_size(self):
  items = list(dataset_utils.dataset_as_numpy(self.builder.as_dataset(
      split=splits_lib.Split.TRAIN + splits_lib.Split.TEST, batch_size=10)))
  # 3 batches of 10
  self.assertEqual(3, len(items))
  x1, x2, x3 = items[0]["x"], items[1]["x"], items[2]["x"]
  self.assertEqual(10, x1.shape[0])
  self.assertEqual(10, x2.shape[0])
  self.assertEqual(10, x3.shape[0])
  self.assertEqual(sum(range(30)), int(x1.sum() + x2.sum() + x3.sum()))
def test_tensors_match(self):
  t = tf.random.uniform(
      shape=(50, 3),
      maxval=1000,
      dtype=tf.int32,
  )
  ds = dataset_utils.dataset_as_numpy({"a": t, "b": t})
  # sess.run() should be called a single time for all inputs. Otherwise
  # inputs and targets may not match.
  self.assertAllEqual(ds["a"], ds["b"])
def test_nested_dataset_nested_elements(self):
  ds1 = _create_dataset(range(10))
  ds1 = ds1.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)})
  ds2 = _create_dataset(range(10, 20))
  np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
  np_ds1 = np_ds[0]
  np_ds2 = np_ds[1]["a"]

  for i, (el1, el2) in enumerate(zip(np_ds1, np_ds2)):
    self.assertEqual(i + 10, el2)
    self.assertEqual(i, el1["a"])
    self.assertEqual(i + 1, el1["b"])
    self.assertEqual(i + 2, el1["c"][0])
    self.assertEqual(i + 3, el1["c"][1])
def test_all_splits(self):
  splits = dataset_utils.dataset_as_numpy(
      self.builder.as_dataset(batch_size=-1))
  self.assertSetEqual(set(splits.keys()),
                      set([splits_lib.Split.TRAIN, splits_lib.Split.TEST]))

  # Test that enum and string both access the same object
  self.assertIs(splits["train"], splits[splits_lib.Split.TRAIN])
  self.assertIs(splits["test"], splits[splits_lib.Split.TEST])

  train_data = splits[splits_lib.Split.TRAIN]["x"]
  test_data = splits[splits_lib.Split.TEST]["x"]
  self.assertEqual(20, len(train_data))
  self.assertEqual(10, len(test_data))
  self.assertEqual(sum(range(30)), int(train_data.sum() + test_data.sum()))
def _assertAsDataset(self, builder):
  split_to_checksums = {}  # {"split": set(examples_checksums)}
  for split_name, expected_examples_number in self.SPLITS.items():
    dataset = builder.as_dataset(split=split_name)
    compare_shapes_and_types(builder.info.features.get_tensor_info(),
                             dataset.output_types, dataset.output_shapes)
    examples = list(dataset_utils.dataset_as_numpy(
        builder.as_dataset(split=split_name)))
    split_to_checksums[split_name] = set(checksum(rec) for rec in examples)
    self.assertLen(examples, expected_examples_number)
  for (split1, hashes1), (split2, hashes2) in itertools.combinations(
      split_to_checksums.items(), 2):
    if (split1 in self.OVERLAPPING_SPLITS or
        split2 in self.OVERLAPPING_SPLITS):
      continue
    self.assertFalse(
        hashes1.intersection(hashes2),
        ("Splits '%s' and '%s' are overlapping. Are you sure you want to "
         "have the same objects in those splits? If yes, add one of them "
         "to the OVERLAPPING_SPLITS class attribute.") % (split1, split2))
def test_singleton_tensor(self):
  t = tf.random.normal((10, 10))
  np_t = dataset_utils.dataset_as_numpy(t)
  self.assertEqual((10, 10), np_t.shape)
  self.assertEqual(np.float32, np_t.dtype)
def test_supervised_keys(self):
  x, _ = dataset_utils.dataset_as_numpy(self.builder.as_dataset(
      split=splits_lib.Split.TRAIN, as_supervised=True, batch_size=-1))
  self.assertEqual(x.shape[0], 20)
def test_singleton_dataset(self):
  ds = _create_dataset(range(10))
  np_ds = dataset_utils.dataset_as_numpy(ds)
  self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)])
def get_dataset_feature_statistics(builder, split):
  """Calculates statistics for the specified split."""
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Fill in the schema to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Feature dictionaries.
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  np_dataset = dataset_utils.dataset_as_numpy(dataset)
  for example in tqdm.tqdm(np_dataset, unit=" examples"):
    statistics.num_examples += 1

    assert isinstance(example, dict)

    feature_names = sorted(example.keys())
    for feature_name in feature_names:

      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_np = example[feature_name]

      # For compatibility between graph and eager mode, we can get plain
      # Python types (PODs) here, so everything may not be neatly wrapped up
      # in numpy's ndarray.
      feature_dtype = type(feature_np)
      if isinstance(feature_np, np.ndarray):
        feature_dtype = feature_np.dtype.type

      feature_min, feature_max = None, None
      is_numeric = (np.issubdtype(feature_dtype, np.number) or
                    feature_dtype == np.bool_)
      if is_numeric:
        feature_min = np.min(feature_np)
        feature_max = np.max(feature_np)

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set or update the min, max.
      if is_numeric:
        if ((feature_name not in feature_to_min) or
            (feature_to_min[feature_name] > feature_min)):
          feature_to_min[feature_name] = feature_min

        if ((feature_name not in feature_to_max) or
            (feature_to_max[feature_name] < feature_max)):
          feature_to_max[feature_name] = feature_max

  # At this point, we've processed all examples.

  output_shapes_dict = dataset.output_shapes
  output_types_dict = dataset.output_types

  for feature_name in sorted(feature_to_num_examples.keys()):
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): Make this work with nested structures, currently the
    # Schema proto has no support for it.
    maybe_feature_shape = output_shapes_dict[feature_name]
    if not isinstance(maybe_feature_shape, tf.TensorShape):
      logging.error(
          "Statistics generation doesn't work for nested structures yet")
      continue

    for dim in maybe_feature_shape.as_list():
      # We denote `None`s as -1 in the shape proto.
      feature.shape.dim.add().size = dim if dim else -1
    feature_type = output_types_dict[feature_name]
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
      numeric_statistics = statistics_pb2.NumericStatistics()
      numeric_statistics.min = feature_to_min[feature_name]
      numeric_statistics.max = feature_to_max[feature_name]
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
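def _example_usage_get_statistics(builder):
  # Illustrative usage sketch (hypothetical helper, not part of the original
  # module): for a builder that has already been downloaded and prepared,
  # computes the statistics and schema of its TRAIN split. Assumes
  # `splits_lib` is imported as elsewhere in this codebase.
  statistics, schema = get_dataset_feature_statistics(
      builder, splits_lib.Split.TRAIN)
  return statistics.num_examples, schema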
def test_with_graph(self):
  with tf.Graph().as_default() as g:
    ds = _create_dataset(range(10))
    np_ds = dataset_utils.dataset_as_numpy(ds, graph=g)
    self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)])