Example #1
    def test_multi_split(self):
        with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
            ds_train, ds_test = registered.load(
                name="dummy_dataset_shared_generator",
                data_dir=tmp_dir,
                split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST],
                as_dataset_kwargs=dict(shuffle_files=False))

            data = list(dataset_utils.dataset_as_numpy(ds_train))
            self.assertEqual(20, len(data))

            data = list(dataset_utils.dataset_as_numpy(ds_test))
            self.assertEqual(10, len(data))
Example #2
  def test_with_configs(self):
    with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
      builder1 = DummyDatasetWithConfigs(config="plus1", data_dir=tmp_dir)
      builder2 = DummyDatasetWithConfigs(config="plus2", data_dir=tmp_dir)
      # Test that builder.builder_config is the correct config
      self.assertIs(builder1.builder_config,
                    DummyDatasetWithConfigs.builder_configs["plus1"])
      self.assertIs(builder2.builder_config,
                    DummyDatasetWithConfigs.builder_configs["plus2"])
      builder1.download_and_prepare()
      builder2.download_and_prepare()
      data_dir1 = os.path.join(tmp_dir, builder1.name, "plus1", "0.0.1")
      data_dir2 = os.path.join(tmp_dir, builder2.name, "plus2", "0.0.2")
      # Test that subdirectories were created per config
      self.assertTrue(tf.gfile.Exists(data_dir1))
      self.assertTrue(tf.gfile.Exists(data_dir2))
      # 2 train shards, 1 test shard, plus metadata files
      self.assertGreater(len(tf.gfile.ListDirectory(data_dir1)), 3)
      self.assertGreater(len(tf.gfile.ListDirectory(data_dir2)), 3)

      # Test that the config was used and they didn't collide.
      splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
      for builder, incr in [(builder1, 1), (builder2, 2)]:
        train_data, test_data = [
            [el["x"] for el in
             dataset_utils.dataset_as_numpy(builder.as_dataset(split=split))]
            for split in splits_list
        ]

        self.assertEqual(20, len(train_data))
        self.assertEqual(10, len(test_data))
        self.assertEqual([incr + el for el in range(30)],
                         sorted(train_data + test_data))
Example #3
def features_encode_decode(features_dict, example, as_tensor=False):
    """Runs the full pipeline: encode > write > tmp files > read > decode."""
    # Encode example
    encoded_example = features_dict.encode_example(example)

    with tmp_dir() as tmp_dir_:
        tmp_filename = os.path.join(tmp_dir_, "tmp.tfrecord")

        # Read/write the file
        file_adapter = file_format_adapter.TFRecordExampleAdapter(
            features_dict.get_serialized_info())
        file_adapter.write_from_generator(
            generator_fn=lambda: [encoded_example],
            output_files=[tmp_filename],
        )
        dataset = file_adapter.dataset_from_filename(tmp_filename)

        # Decode the example
        dataset = dataset.map(features_dict.decode_example)

        if not as_tensor:  # Evaluate to numpy array
            for el in dataset_utils.dataset_as_numpy(dataset):
                return el
        else:
            if tf.executing_eagerly():
                return next(iter(dataset))
            else:
                return tf.compat.v1.data.make_one_shot_iterator(
                    dataset).get_next()
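
A minimal usage sketch for the helper above. The feature spec and example below are illustrative assumptions, not part of the original test module:

import tensorflow as tf
import tensorflow_datasets as tfds

# Round-trip a single scalar feature through the encode > write > read > decode
# pipeline defined above (hypothetical feature spec and example).
features_dict = tfds.features.FeaturesDict(
    {"x": tfds.features.Tensor(shape=(), dtype=tf.int64)})
decoded = features_encode_decode(features_dict, {"x": 42})
print(decoded["x"])  # 42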
Example #4
  def test_determinism(self):
    with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
      ds = registered.load(
          name="dummy_dataset_shared_generator",
          data_dir=tmp_dir,
          split=splits_lib.Split.TRAIN,
          as_dataset_kwargs=dict(shuffle_files=False))
      ds_values = list(dataset_utils.dataset_as_numpy(ds))

      # Ensure determinism. If this test fails, it means that the numpy random
      # module isn't always deterministic (e.g. across versions or
      # architectures), and so our datasets aren't guaranteed to be either.
      l = list(range(20))
      np.random.RandomState(42).shuffle(l)
      self.assertEqual(l, [
          0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10, 14, 6
      ])

      # Ensure determinism. If this test fails, it means the dataset is not
      # deterministically generated.
      self.assertEqual(
          [e["x"] for e in ds_values],
          [24, 1, 3, 4, 15, 25, 0, 16, 21, 10, 6, 13, 27, 22, 12, 28, 9, 19,
           18, 7],
      )
Example #5
    def test_shared_generator(self):
        with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
            builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)
            builder.download_and_prepare()

            written_filepaths = [
                os.path.join(builder._data_dir, fname)
                for fname in tf.io.gfile.listdir(builder._data_dir)
            ]
            # The data_dir contains the cached directory by default
            expected_filepaths = builder._build_split_filenames(
                split_info_list=builder.info.splits.values())
            expected_filepaths.append(
                os.path.join(builder._data_dir, "dataset_info.json"))
            self.assertEqual(sorted(expected_filepaths),
                             sorted(written_filepaths))

            splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
            train_data, test_data = [[
                el["x"] for el in dataset_utils.dataset_as_numpy(
                    builder.as_dataset(split=split))
            ] for split in splits_list]

            self.assertEqual(20, len(train_data))
            self.assertEqual(10, len(test_data))
            self.assertEqual(list(range(30)), sorted(train_data + test_data))

            # Builder's info should also have the above information.
            self.assertTrue(builder.info.initialized)
            self.assertEqual(
                20, builder.info.splits[splits_lib.Split.TRAIN].num_examples)
            self.assertEqual(
                10, builder.info.splits[splits_lib.Split.TEST].num_examples)
            self.assertEqual(30, builder.info.splits.total_num_examples)
Example #6
    def test_nested_dataset_simultaneous_access(self):
        ds1 = _create_dataset(range(10))
        ds2 = _create_dataset(range(10, 20))
        np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
        np_ds1 = np_ds[0]
        np_ds2 = np_ds[1]["a"]

        for i1, i2 in zip(np_ds1, np_ds2):
            self.assertEqual(i2, int(i1) + 10)
Example #7
    def test_nested_dataset_sequential_access(self):
        ds1 = _create_dataset(range(10))
        ds2 = _create_dataset(range(10, 20))
        np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
        np_ds1 = np_ds[0]
        np_ds2 = np_ds[1]["a"]

        self.assertEqual(list(range(10)), [int(el) for el in list(np_ds1)])
        self.assertEqual(list(range(10, 20)), [int(el) for el in list(np_ds2)])
Example #8
 def test_singleton_dataset_with_nested_elements(self):
     ds = _create_dataset(range(10))
     ds = ds.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)})
     np_ds = dataset_utils.dataset_as_numpy(ds)
     for i, el in enumerate(np_ds):
         self.assertEqual(i, el["a"])
         self.assertEqual(i + 1, el["b"])
         self.assertEqual(i + 2, el["c"][0])
         self.assertEqual(i + 3, el["c"][1])
Example #9
 def test_load(self):
     with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
         dataset = registered.load(name="dummy_dataset_shared_generator",
                                   data_dir=tmp_dir,
                                   download=True,
                                   split=splits_lib.Split.TRAIN)
         data = list(dataset_utils.dataset_as_numpy(dataset))
         self.assertEqual(20, len(data))
         self.assertLess(data[0]["x"], 30)
Example #10
    def test_nested_tensors(self):
        t1 = tf.random.normal((10, 10))
        t2 = tf.random.normal((10, 20))
        nest_tup = (t1, t2)
        np_t1, np_t2 = dataset_utils.dataset_as_numpy(nest_tup)
        self.assertEqual((10, 10), np_t1.shape)
        self.assertEqual(np.float32, np_t1.dtype)
        self.assertEqual((10, 20), np_t2.shape)
        self.assertEqual(np.float32, np_t2.dtype)

        nest_dict = {"foo": t1, "bar": {"zoo": t2}}
        np_nest_dict = dataset_utils.dataset_as_numpy(nest_dict)
        np_t1 = np_nest_dict["foo"]
        np_t2 = np_nest_dict["bar"]["zoo"]
        self.assertEqual((10, 10), np_t1.shape)
        self.assertEqual(np.float32, np_t1.dtype)
        self.assertEqual((10, 20), np_t2.shape)
        self.assertEqual(np.float32, np_t2.dtype)
Example #11
 def test_with_batch_size(self):
   items = list(dataset_utils.dataset_as_numpy(self.builder.as_dataset(
       split=splits_lib.Split.TRAIN + splits_lib.Split.TEST, batch_size=10)))
   # 3 batches of 10
   self.assertEqual(3, len(items))
   x1, x2, x3 = items[0]["x"], items[1]["x"], items[2]["x"]
   self.assertEqual(10, x1.shape[0])
   self.assertEqual(10, x2.shape[0])
   self.assertEqual(10, x3.shape[0])
   self.assertEqual(sum(range(30)), int(x1.sum() + x2.sum() + x3.sum()))
Example #12
    def test_tensors_match(self):
        t = tf.random.uniform(
            shape=(50, 3),
            maxval=1000,
            dtype=tf.int32,
        )

        ds = dataset_utils.dataset_as_numpy({"a": t, "b": t})
        # sess.run() should be called a single time for all inputs; otherwise
        # the input and target may not match.
        self.assertAllEqual(ds["a"], ds["b"])
Example #13
    def test_nested_dataset_nested_elements(self):
        ds1 = _create_dataset(range(10))
        ds1 = ds1.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)})
        ds2 = _create_dataset(range(10, 20))
        np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2}))
        np_ds1 = np_ds[0]
        np_ds2 = np_ds[1]["a"]

        for i, (el1, el2) in enumerate(zip(np_ds1, np_ds2)):
            self.assertEqual(i + 10, el2)
            self.assertEqual(i, el1["a"])
            self.assertEqual(i + 1, el1["b"])
            self.assertEqual(i + 2, el1["c"][0])
            self.assertEqual(i + 3, el1["c"][1])
Example #14
  def test_all_splits(self):
    splits = dataset_utils.dataset_as_numpy(
        self.builder.as_dataset(batch_size=-1))
    self.assertSetEqual(set(splits.keys()),
                        set([splits_lib.Split.TRAIN, splits_lib.Split.TEST]))

    # Test that the enum and the string key access the same object
    self.assertIs(splits["train"], splits[splits_lib.Split.TRAIN])
    self.assertIs(splits["test"], splits[splits_lib.Split.TEST])

    train_data = splits[splits_lib.Split.TRAIN]["x"]
    test_data = splits[splits_lib.Split.TEST]["x"]
    self.assertEqual(20, len(train_data))
    self.assertEqual(10, len(test_data))
    self.assertEqual(sum(range(30)), int(train_data.sum() + test_data.sum()))
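
The same "whole split in memory" pattern is also available through the public API; a minimal sketch, assuming a dataset that fits in memory (the dataset name and feature keys are assumptions, not taken from the tests above):

import tensorflow_datasets as tfds

# batch_size=-1 returns the entire split as a single batch, which
# tfds.as_numpy() then converts to a dict of numpy arrays.
ds = tfds.load("mnist", split="train", batch_size=-1)
np_ds = tfds.as_numpy(ds)
print(np_ds["image"].shape)  # e.g. (60000, 28, 28, 1)
print(np_ds["label"].shape)  # e.g. (60000,)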
Example #15
 def _assertAsDataset(self, builder):
     split_to_checksums = {}  # {"split": set(examples_checksums)}
     for split_name, expected_examples_number in self.SPLITS.items():
         dataset = builder.as_dataset(split=split_name)
         compare_shapes_and_types(builder.info.features.get_tensor_info(),
                                  dataset.output_types,
                                  dataset.output_shapes)
         examples = list(
             dataset_utils.dataset_as_numpy(
                 builder.as_dataset(split=split_name)))
         split_to_checksums[split_name] = set(
             checksum(rec) for rec in examples)
         self.assertLen(examples, expected_examples_number)
     for (split1, hashes1), (split2, hashes2) in itertools.combinations(
             split_to_checksums.items(), 2):
         if (split1 in self.OVERLAPPING_SPLITS
                 or split2 in self.OVERLAPPING_SPLITS):
             continue
         self.assertFalse(hashes1.intersection(hashes2), (
             "Splits '%s' and '%s' are overlapping. Are you sure you want to "
             "have the same objects in those splits? If yes, add one one of "
             "them to OVERLAPPING_SPLITS class attribute.") %
                          (split1, split2))
Example #16
 def test_singleton_tensor(self):
     t = tf.random.normal((10, 10))
     np_t = dataset_utils.dataset_as_numpy(t)
     self.assertEqual((10, 10), np_t.shape)
     self.assertEqual(np.float32, np_t.dtype)
Example #17
 def test_supervised_keys(self):
   x, _ = dataset_utils.dataset_as_numpy(self.builder.as_dataset(
       split=splits_lib.Split.TRAIN, as_supervised=True, batch_size=-1))
   self.assertEqual(x.shape[0], 20)
Example #18
 def test_singleton_dataset(self):
     ds = _create_dataset(range(10))
     np_ds = dataset_utils.dataset_as_numpy(ds)
     self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)])
Example #19
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split."""
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Fill in the schema to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        feature_names = sorted(example.keys())
        for feature_name in feature_names:

            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility between graph and eager mode, we may get plain
            # Python values (PODs) here, so not everything is necessarily
            # wrapped in a numpy ndarray.

            feature_dtype = type(feature_np)

            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type

            feature_min, feature_max = None, None
            is_numeric = (np.issubdtype(feature_dtype, np.number)
                          or feature_dtype == np.bool_)
            if is_numeric:
                feature_min = np.min(feature_np)
                feature_max = np.max(feature_np)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
            # logic for that.

            # Set or update the min, max.
            if is_numeric:
                if ((feature_name not in feature_to_min)
                        or (feature_to_min[feature_name] > feature_min)):
                    feature_to_min[feature_name] = feature_min

                if ((feature_name not in feature_to_max)
                        or (feature_to_max[feature_name] < feature_max)):
                    feature_to_max[feature_name] = feature_max

    # At this point, we've processed all examples.

    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples.keys()):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the Schema
        # proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto.
            feature.shape.dim.add().size = dim if dim else -1
        feature_type = output_types_dict[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = feature_to_num_examples[
            feature_name]
        common_statistics.num_missing = (statistics.num_examples -
                                         common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added to
        # the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            numeric_statistics.min = feature_to_min[feature_name]
            numeric_statistics.max = feature_to_max[feature_name]
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
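
A minimal sketch of how the function above might be driven; the builder name is an assumption, and the data must already be downloaded and prepared:

import tensorflow_datasets as tfds

# Hypothetical driver: compute per-feature statistics for the "train" split of
# a prepared builder.
builder = tfds.builder("mnist")
builder.download_and_prepare()
stats, schema = get_dataset_feature_statistics(builder, split="train")
print(stats.num_examples)
for feature in schema.feature:
    print(feature.name, feature.type)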
Example #20
 def test_with_graph(self):
     with tf.Graph().as_default() as g:
         ds = _create_dataset(range(10))
     np_ds = dataset_utils.dataset_as_numpy(ds, graph=g)
     self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)])
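
For reference, the conversion used throughout these examples is also exposed publicly; a minimal eager-mode sketch, assuming a current TFDS release where the public entry point is tfds.as_numpy (older releases exposed it as tfds.dataset_as_numpy):

import tensorflow as tf
import tensorflow_datasets as tfds

# Convert a tf.data.Dataset into an iterable of numpy elements.
ds = tf.data.Dataset.range(5)
print([int(el) for el in tfds.as_numpy(ds)])  # [0, 1, 2, 3, 4]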