def test_builder_from_directory(code_builder: dataset_builder.DatasetBuilder):
    """Builder can be created from the files only.

    Args:
        code_builder: Builder defined in code. Its `data_dir` is used to
            reconstruct a second builder from the files on disk, and its
            attributes serve as the reference values for the comparison.
            Presumably the dataset has already been prepared in that
            directory -- verify against the fixture.
    """

    # Reconstruct the dataset
    builder = read_only_builder.builder_from_directory(code_builder.data_dir)
    assert builder.name == code_builder.name
    assert builder.data_dir == code_builder.data_dir
    assert builder.info.version == code_builder.info.version
    assert builder.info.full_name == code_builder.info.full_name
    assert repr(builder.info) == repr(code_builder.info)
    assert builder.VERSION == code_builder.info.version
    assert builder.RELEASE_NOTES == code_builder.info.release_notes
    assert builder.__module__ == type(code_builder).__module__
    # Restoring one builder must not set the version on the shared base class.
    assert read_only_builder.ReadOnlyBuilder.VERSION is None

    if code_builder.builder_config:
        assert builder.builder_config
        code_config = code_builder.builder_config
        file_config = builder.builder_config
        # Config attributes should be restored too
        assert code_config.name == file_config.name
        assert code_config.description == file_config.description
        assert code_config.version == file_config.version

    # Test that the dataset can be read, and that the restored builder yields
    # the same examples as the original code-defined builder. The reference
    # must come from `code_builder`; reading it from `builder` would only
    # compare the restored builder with itself.
    ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
    origin_ds = dataset_utils.as_numpy(
        code_builder.as_dataset(split='train').take(5))
    assert [ex['id'] for ex in ds] == [ex['id'] for ex in origin_ds]

    builder.download_and_prepare()  # Should be a no-op

    # Test pickling and un-pickling
    builder2 = dill.loads(dill.dumps(builder))
    assert builder.name == builder2.name
    assert builder.version == builder2.version
  def test_all_splits(self):
    """Loading without a split returns TRAIN and TEST, keyed by enum or str."""
    by_split = dataset_utils.as_numpy(
        self.builder.as_dataset(batch_size=-1))
    train_key, test_key = splits_lib.Split.TRAIN, splits_lib.Split.TEST
    self.assertSetEqual(set(by_split.keys()), {train_key, test_key})

    # String names and enum members must resolve to the very same object.
    self.assertIs(by_split["train"], by_split[train_key])
    self.assertIs(by_split["test"], by_split[test_key])

    train_x = by_split[train_key]["x"]
    test_x = by_split[test_key]["x"]
    self.assertEqual(20, len(train_x))
    self.assertEqual(10, len(test_x))
    # The values of both splits together must sum to 0 + 1 + ... + 29.
    combined_total = int(train_x.sum() + test_x.sum())
    self.assertEqual(sum(range(30)), combined_total)
Beispiel #3
0
    def test_custom_as_dataset(self):
        """`mock_data` uses a caller-supplied `as_dataset_fn` to build data."""

        def _as_dataset(self, *args, **kwargs):  # pylint: disable=unused-argument
            def _examples():
                # One feature dict per sentence.
                for sentence in ['some sentence', 'some other sentence']:
                    yield {'text': sentence}

            return tf.data.Dataset.from_generator(
                _examples,
                output_types=self.info.features.dtype,
                output_shapes=self.info.features.shape,
            )

        with mocking.mock_data(as_dataset_fn=_as_dataset):
            ds = registered.load('lm1b', split='train')
            texts = [ex['text'] for ex in dataset_utils.as_numpy(ds)]
            self.assertEqual(texts, [b'some sentence', b'some other sentence'])
Beispiel #4
0
 def test_supervised_keys_nested(self):
     """A nested `supervised_keys` structure is mirrored by the output."""
     nested_keys = ("x", ("x", ("x", "x")), {
         "a": "x",
         "b": ("x", )
     })
     self.builder = DummyDatasetWithSupervisedKeys(
         data_dir=self._tfds_tmp_dir,
         supervised_keys=nested_keys)
     full_train = self.builder.as_dataset(split=splits_lib.Split.TRAIN,
                                          as_supervised=True,
                                          batch_size=-1)
     single, pair, a_dict = dataset_utils.as_numpy(full_train)
     # Every leaf holds the 20 values of the train split.
     self.assertEqual(single.shape[0], 20)
     self.assertLen(pair, 2)
     self.assertEqual(pair[1][1].shape[0], 20)
     self.assertLen(a_dict, 2)
     self.assertEqual(a_dict["b"][0].shape[0], 20)
Beispiel #5
0
def main(_):
    """Load the "pacs" validation split and print each example's label.

    The data directory defaults to `local_settings.TF_DATASET_PATH` and is
    overridden by `flags.tfds_path` when that flag is set.

    Args:
        _: Unused positional argument (presumably the argv list handed over
            by the application runner -- verify against the caller).
    """
    builder_kwargs = {
        "validation_split": flags.validation_split
    }

    # An explicitly provided path takes precedence over the local default.
    tfdataset_path = local_settings.TF_DATASET_PATH
    if flags.tfds_path is not None:
        tfdataset_path = flags.tfds_path

    # `with_info=True` makes `tfds.load` return a (dataset, info) pair; the
    # info object is not needed here.
    validation, _info = tfds.load("pacs",
        data_dir=tfdataset_path, split=tfds.Split.VALIDATION,
        builder_kwargs=builder_kwargs, with_info=True)

    # The leftover `pdb.set_trace()` breakpoint was removed so the script can
    # run unattended instead of halting on every example.
    for example in dataset_utils.as_numpy(validation):
        print(example["attributes"]["label"])
    def test_decoding(self):
        """A decoder given for a nested key is applied to that feature."""
        nested_feature = feature_lib.Dataset(
            {
                'a': tf.string,
                'b': {'c': tf.uint8},
            },
            length=None,
        )
        expected_shape = {'a': (), 'b': {'c': ()}}
        expected_dtype = {'a': tf.string, 'b': {'c': tf.uint8}}

        input_value = dataset_utils.as_numpy(
            tf.data.Dataset.from_tensor_slices({
                'a': ['aa', 'b', 'ccc'],
                'b': {'c': [1, 2, 3]},
            }))
        # Only 'b'/'c' gets a decoder; its expected values are the inputs
        # shifted by one (presumably what IncrementDecoder does).
        nested_decoders = {'b': {'c': IncrementDecoder()}}
        expected_value = tf.data.Dataset.from_tensor_slices({
            'a': [tf.compat.as_bytes(text) for text in ('aa', 'b', 'ccc')],
            'b': {'c': [2, 3, 4]},
        })

        self.assertFeatureEagerOnly(
            feature=nested_feature,
            shape=expected_shape,
            dtype=expected_dtype,
            tests=[
                testing.FeatureExpectationItem(
                    value=input_value,
                    decoders=nested_decoders,
                    expected=expected_value,
                ),
            ],
        )
Beispiel #7
0
    def test_nested_sequence(self):
        """Nested sequence features load with the expected values and specs."""

        def _coordinates(rows):
            # Ragged constant whose innermost dimension is fixed to 2.
            return tf.ragged.constant(rows, inner_shape=(2, ))

        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            train_ds, info = registered.load(name="nested_sequence_builder",
                                             data_dir=tmp_dir,
                                             split="train",
                                             with_info=True,
                                             shuffle_files=False)
            coords = [
                example["frames"]["coordinates"]
                for example in dataset_utils.as_numpy(train_ds)
            ]
            # The split is expected to contain exactly three examples.
            first, second, third = coords

            self.assertAllEqual(
                first,
                _coordinates([
                    [[0, 1], [2, 3], [4, 5]],
                    [],
                    [[6, 7]],
                ]))
            self.assertAllEqual(second,
                                tf.ragged.constant([], ragged_rank=1))
            self.assertAllEqual(
                third,
                _coordinates([
                    [[10, 11]],
                    [[12, 13], [14, 15]],
                ]))

            features = info.features
            self.assertEqual(
                features.dtype,
                {"frames": {"coordinates": tf.int32}},
            )
            self.assertEqual(
                features.shape,
                {"frames": {"coordinates": (None, None, 2)}},
            )
            tensor_info = features.get_tensor_info()
            self.assertEqual(
                tensor_info["frames"]["coordinates"].sequence_rank,
                2,
            )
Beispiel #8
0
def features_encode_decode(features_dict, example, decoders):
    """Round-trips `example`: serialize > tf.data pipeline > deserialize.

    Args:
        features_dict: Object exposing `serialize_example` and
            `deserialize_example`.
        example: The example to serialize.
        decoders: Forwarded to `deserialize_example` as `decoders=`.

    Returns:
        A `(out_tensor, out_numpy, element_spec)` tuple: the decoded element,
        the same element converted through `dataset_utils.as_numpy`, and the
        `element_spec` of the decoding dataset.
    """
    serialized_example = features_dict.serialize_example(example)

    decode_fn = functools.partial(
        features_dict.deserialize_example, decoders=decoders)
    ds = tf.data.Dataset.from_tensors(serialized_example).map(decode_fn)

    # Eager mode can iterate the dataset directly; otherwise a one-shot
    # iterator provides the single element.
    out_tensor = (
        next(iter(ds)) if tf.executing_eagerly()
        else tf.compat.v1.data.make_one_shot_iterator(ds).get_next())
    return out_tensor, dataset_utils.as_numpy(out_tensor), ds.element_spec
Beispiel #9
0
def _read_records(path, file_format=file_adapters.DEFAULT_FILE_FORMAT):
  """Returns (files_names, list_of_records_in_each_file).

  Args:
    path: path to tfrecord, omitting suffix.
    file_format: format of the record files.
  """
  # Collect the shard files in sorted order, leaving out the index files.
  shard_paths = [
      p for p in sorted(tf.io.gfile.glob('%s-*-of-*' % path))
      if not p.endswith(writer_lib._INDEX_PATH_SUFFIX)
  ]
  # One list of records per shard file, in the same order as `shard_paths`.
  records_per_file = [
      list(
          dataset_utils.as_numpy(
              file_adapters.ADAPTER_FOR_FORMAT[file_format].make_tf_data(p)))
      for p in shard_paths
  ]
  file_names = [os.path.basename(p) for p in shard_paths]
  return file_names, records_per_file
    def assertFeature(self, specs, serialized_info, tests):
        """Test the TFRecordExampleAdapter encoding.

        Args:
            specs: Specs used to build the adapter; also the reference for
                the dtype and shape checks on the parsed example.
            serialized_info: Expected result of the adapter parser's
                `_build_feature_specs()`.
            tests: Iterable of test items. The attributes read from each are
                `value`, `expected`, `expected_serialized`, `raise_cls` and
                `raise_msg`.
        """

        def _dtype_of(item):
            return item.dtype

        def _assert_shape_compatible(pair):
            # `pair` is (parsed tensor, spec); (None, 3) must match (5, 3),
            # hence tf.TensorShape.assert_is_compatible_with.
            return pair[0].shape.assert_is_compatible_with(pair[1].shape)

        adapter = file_format_adapter.TFRecordExampleAdapter(specs)

        with self._subTest("serialized_info"):
            self.assertEqual(serialized_info,
                             adapter._parser._build_feature_specs())

        for index, case in enumerate(tests):
            with self._subTest(str(index)):

                expects_failure = case.raise_cls is not None
                if expects_failure:
                    # Serialization itself must raise; nothing else to check.
                    with self.assertRaisesWithPredicateMatch(
                            case.raise_cls, case.raise_msg):
                        adapter._serializer.serialize_example(case.value)
                    continue
                serialized = adapter._serializer.serialize_example(case.value)

                if case.expected_serialized is not None:
                    actual_proto = tf.train.Example()
                    actual_proto.ParseFromString(serialized)
                    wanted_features = tf.train.Features(
                        feature=case.expected_serialized)
                    wanted_proto = tf.train.Example(features=wanted_features)
                    self.assertEqual(wanted_proto, actual_proto)

                example = _parse_example(serialized,
                                         adapter._parser.parse_example)

                with self._subTest("dtype"):
                    out_dtypes = utils.map_nested(_dtype_of, example)
                    expected_dtypes = utils.map_nested(_dtype_of, specs)
                    self.assertEqual(out_dtypes, expected_dtypes)
                with self._subTest("shape"):
                    utils.map_nested(
                        _assert_shape_compatible,
                        utils.zip_nested(example, specs))
                np_example = dataset_utils.as_numpy(example)
                self.assertAllEqualNested(np_example, case.expected)
Beispiel #11
0
  def _assertAsDataset(self, builder):
    """Check the label distribution.

    This checks that labels get correctly converted between the synset ids
    and integers.

    Args:
      builder: The ImagenetA dataset builder.
    """
    super()._assertAsDataset(builder)
    label_feature = builder.info.features['label']
    test_examples = dataset_utils.as_numpy(builder.as_dataset()['test'])
    # Count how often each label name occurs in the test split.
    label_counts = collections.Counter(
        label_feature.int2str(example['label']) for example in test_examples)
    expected_counts = {
        'n01580077': 2,
        'n01616318': 3,
        'n07697313': 5,
    }
    self.assertEqual(dict(label_counts), expected_counts)
Beispiel #12
0
  def test_ragged_tensors(self):
    """`as_numpy` returns a ragged value for a ragged tensor input."""
    rows = [
        [1, 2, 3],
        [],
        [4, 5],
    ]
    converted = dataset_utils.as_numpy(tf.ragged.constant(rows))

    if tf.executing_eagerly():
      self.assertIsInstance(converted, tf.RaggedTensor)
    else:
      # Output of `sess.run(rt)` is a `RaggedTensorValue` object
      self.assertIsInstance(converted, tf.compat.v1.ragged.RaggedTensorValue)

    self.assertAllEqual(converted, tf.ragged.constant(rows))
    def _assertBeamGeneration(self, dl_config, dataset_cls, dataset_name):
        """Generate `dataset_cls` and check its shards, examples and metadata.

        Args:
            dl_config: Passed to `download_and_prepare` as `download_config`.
            dataset_cls: Builder class, instantiated with a temporary
                `data_dir`.
            dataset_name: Name used for the expected data directory and the
                expected shard file patterns.
        """

        def _by_id(examples):
            return sorted(examples, key=lambda ex: ex["id"])

        def _generated(count):
            return _by_id(_gen_example(i)[1] for i in range(count))

        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            builder = dataset_cls(data_dir=tmp_dir)
            builder.download_and_prepare(download_config=dl_config)

            data_dir = os.path.join(tmp_dir, dataset_name, "1.0.0")
            self.assertEqual(data_dir, builder._data_dir)

            # Check number of shards
            test_pattern = "%s-test.tfrecord-{:05}-of-{:05}" % dataset_name
            train_pattern = "%s-train.tfrecord-{:05}-of-{:05}" % dataset_name
            # Liquid sharding is not guaranteed to always use the same
            # number, so the test split's count is read from the info.
            self._assertShards(
                data_dir,
                pattern=test_pattern,
                num_shards=builder.info.splits["test"].num_shards,
            )
            self._assertShards(
                data_dir,
                pattern=train_pattern,
                num_shards=1,
            )

            datasets = dataset_utils.as_numpy(builder.as_dataset())

            # Compare independently of the order the examples were written.
            self._assertElemsAllEqual(_by_id(datasets["test"]),
                                      _generated(725))
            self._assertElemsAllEqual(_by_id(datasets["train"]),
                                      _generated(1000))

            expected_metadata = {
                "label_sum_1000": 500,
                "id_mean_1000": 499.5,
                "label_sum_725": 362,
                "id_mean_725": 362.0,
            }
            self.assertDictEqual(builder.info.metadata, expected_metadata)
  def test_with_batch_size(self):
    """`batch_size` groups the examples and adds a batch dimension."""

    def _x_rank(dataset):
      # Number of dimensions of the "x" feature in the dataset's shapes.
      return len(tf.compat.v1.data.get_output_shapes(dataset)["x"])

    batched_ds = self.builder.as_dataset(split="train+test", batch_size=10)
    batches = list(dataset_utils.as_numpy(batched_ds))
    # 3 batches of 10
    self.assertEqual(3, len(batches))
    xs = [batch["x"] for batch in batches]
    for x in xs:
      self.assertEqual(10, x.shape[0])
    combined_total = int(xs[0].sum() + xs[1].sum() + xs[2].sum())
    self.assertEqual(sum(range(30)), combined_total)

    train = splits_lib.Split.TRAIN
    # By default batch_size is None and won't add a batch dimension
    self.assertEqual(0, _x_rank(self.builder.as_dataset(split=train)))
    # Setting batch_size=1 will add an extra batch dimension
    self.assertEqual(
        1, _x_rank(self.builder.as_dataset(split=train, batch_size=1)))
    # Setting batch_size=2 adds the same single extra batch dimension
    self.assertEqual(
        1, _x_rank(self.builder.as_dataset(split=train, batch_size=2)))
 def _assertAsDataset(self, builder):
   """Check every split's specs and size, and that splits don't overlap.

   For each entry of `self.SPLITS` the dataset's output types and shapes are
   compared with the builder's tensor info and the number of examples is
   checked. Afterwards every pair of splits is checked for examples with an
   identical checksum, unless one of them is in `self.OVERLAPPING_SPLITS`.

   Args:
     builder: The dataset builder whose `as_dataset` output is verified.
   """
   split_to_checksums = {}  # {"split": set(examples_checksums)}
   for split_name, expected_examples_number in self.SPLITS.items():
     dataset = builder.as_dataset(split=split_name)
     compare_shapes_and_types(builder.info.features.get_tensor_info(),
                              dataset.output_types, dataset.output_shapes)
     examples = list(dataset_utils.as_numpy(
         builder.as_dataset(split=split_name)))
     split_to_checksums[split_name] = {checksum(rec) for rec in examples}
     self.assertLen(examples, expected_examples_number)
   for (split1, hashes1), (split2, hashes2) in itertools.combinations(
       split_to_checksums.items(), 2):
     # Splits declared as overlapping are allowed to share examples.
     if (split1 in self.OVERLAPPING_SPLITS or
         split2 in self.OVERLAPPING_SPLITS):
       continue
     self.assertFalse(
         hashes1.intersection(hashes2),
         ("Splits '%s' and '%s' are overlapping. Are you sure you want to "
          "have the same objects in those splits? If yes, add one of "
          "them to OVERLAPPING_SPLITS class attribute.") % (split1, split2))
def test_beam_datasets(
    tmp_path: pathlib.Path,
    dataset_cls: dataset_builder.GeneratorBasedBuilder,
    make_dl_config: Callable[[], download.DownloadConfig],
):
    """Generate `dataset_cls` and check its shards, examples and metadata.

    Args:
        tmp_path: Directory used as the builder's `data_dir`.
        dataset_cls: Builder class to instantiate and prepare.
        make_dl_config: Factory returning the download config passed to
            `download_and_prepare`.
    """

    def _by_id(examples):
        return sorted(examples, key=lambda ex: ex['id'])

    def _generated(count):
        return _by_id(_gen_example(i)[1] for i in range(count))

    dataset_name = dataset_cls.name

    builder = dataset_cls(data_dir=tmp_path)
    builder.download_and_prepare(download_config=make_dl_config())

    data_path = tmp_path / dataset_name / '1.0.0'
    assert data_path.exists()  # Dataset has been generated

    # Check number of shards/generated files
    test_pattern = '%s-test.tfrecord-{:05}-of-{:05}' % dataset_name
    train_pattern = '%s-train.tfrecord-{:05}-of-{:05}' % dataset_name
    # Liquid sharding is not guaranteed to always use the same number, so
    # the test split's shard count is read from the builder info.
    _test_shards(
        data_path,
        pattern=test_pattern,
        num_shards=builder.info.splits['test'].num_shards,
    )
    _test_shards(
        data_path,
        pattern=train_pattern,
        num_shards=1,
    )

    ds = dataset_utils.as_numpy(builder.as_dataset())

    # Compare independently of the order in which examples were written.
    _assert_values_equal(_by_id(ds['test']), _generated(725))
    _assert_values_equal(_by_id(ds['train']), _generated(1000))

    assert builder.info.metadata == builder.EXPECTED_METADATA
  def test_determinism(self):
    """The train split is read in a fixed, reproducible order."""
    train_ds = self.builder.as_dataset(
        split=splits_lib.Split.TRAIN, shuffle_files=False)
    examples = list(dataset_utils.as_numpy(train_ds))

    # Sanity check on numpy itself: if this fails, numpy's random module is
    # not deterministic (across versions, architectures, ...), and the
    # datasets cannot be guaranteed to be either.
    shuffled = list(range(20))
    np.random.RandomState(42).shuffle(shuffled)
    expected_shuffle = [
        0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10, 14, 6
    ]
    self.assertEqual(shuffled, expected_shuffle)

    # If this fails, the dataset is not generated deterministically.
    expected_x = [
        6, 16, 19, 12, 14, 18, 5, 13, 15, 4, 10, 17, 0, 8, 3, 1, 9, 7, 11, 2
    ]
    self.assertEqual([example["x"] for example in examples], expected_x)
Beispiel #18
0
  def test_ragged_tensors_ds(self):
    """`as_numpy` iterates a dataset whose elements are ragged tensors."""
    # Each entry is the (flat_values, rowids) pair of one example.
    value_rowid_pairs = [
        ([0, 1, 2, 3], [0, 0, 0, 2]),  # ex0
        ([], []),  # ex1
        ([4, 5, 6], [0, 1, 1]),  # ex2
    ]

    def _gen_ragged_tensors():
      for pair in value_rowid_pairs:
        yield pair

    flat_ds = tf.data.Dataset.from_generator(
        _gen_ragged_tensors,
        output_types=(tf.int64, tf.int64),
        output_shapes=((None,), (None,))
    )
    ragged_ds = flat_ds.map(tf.RaggedTensor.from_value_rowids)

    examples = list(dataset_utils.as_numpy(ragged_ds))
    rt0, rt1, rt2 = examples
    # Row id 1 is never used in ex0, hence the empty middle row.
    self.assertAllEqual(rt0, [
        [0, 1, 2],
        [],
        [3,],
    ])
    self.assertAllEqual(rt1, [])
    self.assertAllEqual(rt2, [[4], [5, 6]])
Beispiel #19
0
def as_dataframe(
    ds: tf.data.Dataset,
    ds_info: Optional[dataset_info.DatasetInfo] = None,
) -> StyledDataFrame:
    """Convert the dataset into a pandas dataframe.

    Warning: The dataframe will be loaded entirely in memory, you may
    want to call `tfds.as_dataframe` on a subset of the data instead:

    ```
    df = tfds.as_dataframe(ds.take(10), ds_info)
    ```

    Args:
      ds: `tf.data.Dataset`. The tf.data.Dataset object to convert to panda
        dataframe. Examples should not be batched. The full dataset will be
        loaded.
      ds_info: Dataset info object. If given, helps improving the formatting.
        Available either through `tfds.load('mnist', with_info=True)` or
        `tfds.builder('mnist').info`

    Returns:
      dataframe: The `pandas.DataFrame` object
    """
    # Touching the attribute raises a clean error message if pandas isn't
    # installed.
    lazy_imports_lib.lazy_imports.pandas  # pylint: disable=pointless-statement

    # Pack `as_supervised=True` datasets
    if ds_info:
        ds = dataset_info.pack_as_supervised_ds(ds, ds_info)

    # Flatten the keys names, specs,... while keeping the feature key
    # definition order
    columns = _make_columns(ds.element_spec, ds_info=ds_info)

    rows = []
    for example in dataset_utils.as_numpy(ds):
        rows.append(_make_row_dict(example, columns))
    frame = StyledDataFrame(rows)

    # Only columns that define a formatter take part in the styling.
    formatters = {}
    for column in columns:
        if column.format_fn:
            formatters[column.name] = column.format_fn
    frame.current_style.format(formatters)
    return frame
Beispiel #20
0
 def _assertAsDataset(self, builder):
   """Check the label distribution for each split."""
   super(Ucf101Test, self)._assertAsDataset(builder)
   label_feature = builder.info.features['label']
   dataset = builder.as_dataset()
   counts_by_split = {}
   for split_name in Ucf101Test.SPLITS:
     # Count how often each label name occurs in this split.
     split_counts = collections.defaultdict(int)
     counts_by_split[split_name] = split_counts
     for example in dataset_utils.as_numpy(dataset[split_name]):
       split_counts[label_feature.int2str(example['label'])] += 1
   expected_counts = {
       'test': {
           'Archery': 1,
           'Nunchucks': 1
       },
       'train': {
           'Archery': 1,
           'Nunchucks': 2
       }
   }
   self.assertEqual(dict(counts_by_split), expected_counts)
Beispiel #21
0
  def _assertAsDataset(self, builder):
    """Check the label distribution.

    This checks that labels get correctly converted between the synset ids
    and integers.

    Args:
      builder: The ImagenetR dataset builder.
    """
    super()._assertAsDataset(builder)
    label_feature = builder.info.features['label']
    # Materialise the test split once, then derive both views from it.
    test_examples = list(
        dataset_utils.as_numpy(builder.as_dataset()['test']))
    label_counts = collections.Counter(
        label_feature.int2str(example['label']) for example in test_examples)
    filenames = [example['file_name'] for example in test_examples]
    expected_counts = {
        'n01443537': 2,
        'n01484850': 3,
        'n12267677': 5,
    }
    self.assertEqual(dict(label_counts), expected_counts)
    self.assertIn(b'n01443537/1.jpeg', filenames)
Beispiel #22
0
    def test_with_configs(self):
        """Each builder config gets its own directory and its own data."""

        def _read_x(builder, split):
            numpy_ds = dataset_utils.as_numpy(builder.as_dataset(split=split))
            return [el["x"] for el in numpy_ds]

        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            builder1 = DummyDatasetWithConfigs(config="plus1",
                                               data_dir=tmp_dir)
            builder2 = DummyDatasetWithConfigs(config="plus2",
                                               data_dir=tmp_dir)
            # Test that builder.builder_config is the correct config
            registered_configs = DummyDatasetWithConfigs.builder_configs
            self.assertIs(builder1.builder_config,
                          registered_configs["plus1"])
            self.assertIs(builder2.builder_config,
                          registered_configs["plus2"])
            builder1.download_and_prepare()
            builder2.download_and_prepare()
            data_dir1 = os.path.join(tmp_dir, builder1.name, "plus1", "0.0.1")
            data_dir2 = os.path.join(tmp_dir, builder2.name, "plus2", "0.0.2")
            for data_dir in (data_dir1, data_dir2):
                # Test that subdirectories were created per config
                self.assertTrue(tf.io.gfile.exists(data_dir))
            for data_dir in (data_dir1, data_dir2):
                # 1 train shard, 1 test shard, plus metadata files
                self.assertGreater(len(tf.io.gfile.listdir(data_dir)), 2)

            # Test that the config was used and they didn't collide.
            for builder, incr in ((builder1, 1), (builder2, 2)):
                train_data = _read_x(builder, "train")
                test_data = _read_x(builder, "test")

                self.assertEqual(20, len(train_data))
                self.assertEqual(10, len(test_data))
                self.assertCountEqual([incr + el for el in range(30)],
                                      train_data + test_data)
 def _assertAsDataset(self, builder):
   """Check every split's specs and size, and that splits don't overlap.

   For each entry of `self.SPLITS` the dataset's element spec is compared
   with the builder's tensor info and the number of examples is checked.
   Afterwards every pair of splits is checked for examples with an identical
   checksum, unless one of them is in `self.OVERLAPPING_SPLITS`.

   Args:
     builder: The dataset builder whose `as_dataset` output is verified.
   """
   split_to_checksums = {}  # {"split": set(examples_checksums)}
   for split_name, expected_examples_number in self.SPLITS.items():
     ds = builder.as_dataset(split=split_name)
     spec = tf.data.DatasetSpec.from_value(ds)
     compare_shapes_and_types(
         builder.info.features.get_tensor_info(),
         # We use _element_spec because element_spec was added in TF2.5+.
         element_spec=spec._element_spec,  # pylint: disable=protected-access
     )
     examples = list(
         dataset_utils.as_numpy(builder.as_dataset(split=split_name)))
     split_to_checksums[split_name] = {checksum(rec) for rec in examples}
     self.assertLen(examples, expected_examples_number)
   for (split1, hashes1), (split2, hashes2) in itertools.combinations(
       split_to_checksums.items(), 2):
     # Splits declared as overlapping are allowed to share examples.
     if (split1 in self.OVERLAPPING_SPLITS or
         split2 in self.OVERLAPPING_SPLITS):
       continue
     self.assertFalse(
         hashes1.intersection(hashes2),
         ("Splits '%s' and '%s' are overlapping. Are you sure you want to "
          "have the same objects in those splits? If yes, add one of "
          "them to OVERLAPPING_SPLITS class attribute.") % (split1, split2))
    def test_label(self):
        """Class label names inside a Dataset feature map to integer ids."""
        label_feature = feature_lib.Dataset(
            {
                'label': feature_lib.ClassLabel(names=['left', 'right']),
            },
            length=None)

        # Value supplied as a plain list of example dicts.
        list_case = testing.FeatureExpectationItem(
            value=[{
                'label': 'right'
            }, {
                'label': 'left'
            }, {
                'label': 'left'
            }],
            expected=tf.data.Dataset.from_tensor_slices(
                {'label': [1, 0, 0]}),
        )
        # Value supplied as a numpy iterable over a dataset, with a
        # different number of examples than the first case.
        dataset_case = testing.FeatureExpectationItem(
            value=dataset_utils.as_numpy(
                tf.data.Dataset.from_tensor_slices(
                    {'label': ['right', 'left', 'right', 'left']})),
            expected=tf.data.Dataset.from_tensor_slices(
                {'label': [1, 0, 1, 0]}),
        )

        self.assertFeatureEagerOnly(
            feature=label_feature,
            shape={'label': ()},
            dtype={'label': tf.int64},
            serialized_info={
                'label': feature_lib.TensorInfo(shape=(None, ),
                                                dtype=tf.int64),
            },
            tests=[list_case, dataset_case],
            test_attributes=dict(_length=None))
Beispiel #25
0
  def test_shared_generator(self):
    """Preparing the dataset writes the expected files and split sizes."""

    def _read_x(builder, split):
      numpy_ds = dataset_utils.as_numpy(builder.as_dataset(split=split))
      return [el["x"] for el in numpy_ds]

    with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
      builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)
      builder.download_and_prepare()

      data_dir = builder._data_dir
      written_filepaths = sorted(
          os.path.join(data_dir, fname)
          for fname in tf.io.gfile.listdir(data_dir))
      # Besides the split files, the directory holds the dataset info file.
      expected_filepaths = builder._build_split_filenames(
          split_info_list=builder.info.splits.values())
      expected_filepaths.append(os.path.join(data_dir, "dataset_info.json"))
      self.assertEqual(sorted(expected_filepaths), written_filepaths)

      train_split = splits_lib.Split.TRAIN
      test_split = splits_lib.Split.TEST
      train_data = _read_x(builder, train_split)
      test_data = _read_x(builder, test_split)

      self.assertEqual(20, len(train_data))
      self.assertEqual(10, len(test_data))
      self.assertEqual(list(range(30)), sorted(train_data + test_data))

      # Builder's info should also have the above information.
      info = builder.info
      self.assertTrue(info.initialized)
      self.assertEqual(20, info.splits[train_split].num_examples)
      self.assertEqual(10, info.splits[test_split].num_examples)
      self.assertEqual(30, info.splits.total_num_examples)
Beispiel #26
0
 def test_with_graph(self):
   """`as_numpy` reads a dataset built in an explicitly passed graph."""
   with tf.Graph().as_default():
     # Build the dataset in its own graph, then read it from outside that
     # graph's scope by handing the graph over explicitly.
     with tf.Graph().as_default() as dataset_graph:
       ds = _create_dataset(range(10))
     numpy_ds = dataset_utils.as_numpy(ds, graph=dataset_graph)
     values = [int(el) for el in numpy_ds]
     self.assertEqual(list(range(10)), values)
Beispiel #27
0
 def test_supervised_keys(self):
     """`as_supervised=True` yields a pair whose first item has 20 rows."""
     supervised_train = self.builder.as_dataset(
         split=splits_lib.Split.TRAIN,
         as_supervised=True,
         batch_size=-1)
     inputs, _ = dataset_utils.as_numpy(supervised_train)
     self.assertEqual(inputs.shape[0], 20)
Beispiel #28
0
    def _build_single_dataset(self, split, shuffle_files, batch_size, decoders,
                              as_supervised, in_memory):
        """as_dataset for a single split.

        Args:
            split: Split to load. A string is converted to a
                `splits_lib.Split`.
            shuffle_files: Forwarded to `self._as_dataset`. When truthy, the
                returned pipeline is also marked as non-deterministic.
            batch_size: If `-1`, the whole dataset is returned as a single
                element. If otherwise truthy, the dataset is batched with
                `padded_batch`. If falsy, no batching is applied.
            decoders: Forwarded to `self._as_dataset`.
            as_supervised: If truthy, each element is mapped to an
                `(input, target)` tuple using `self.info.supervised_keys`.
            in_memory: If `None`, falls back to the default (currently
                `False`). If truthy and `batch_size != -1`, the dataset is
                first materialised as numpy arrays and re-sliced.

        Returns:
            A `tf.data.Dataset`, or the result of
            `tf.data.experimental.get_single_element` when `batch_size == -1`.

        Raises:
            ValueError: If `as_supervised` is truthy but
                `self.info.supervised_keys` is not set.
        """
        if isinstance(split, six.string_types):
            split = splits_lib.Split(split)

        # batch_size == -1 means "everything in one batch": use a batch size
        # large enough to hold all examples (or sys.maxsize if unknown).
        wants_full_dataset = batch_size == -1
        if wants_full_dataset:
            batch_size = self.info.splits.total_num_examples or sys.maxsize

        # If the dataset is small, load it in memory
        dataset_shape_is_fully_defined = (
            dataset_utils.features_shape_is_fully_defined(self.info.features))
        in_memory_default = False
        # TODO(tfds): Consider default in_memory=True for small datasets with
        # fully-defined shape.
        # Expose and use the actual data size on disk and rm the manual
        # name guards. size_in_bytes is the download size, which is misleading,
        # particularly for datasets that use manual_dir as well as some downloads
        # (wmt and diabetic_retinopathy_detection).
        # in_memory_default = (
        #     self.info.size_in_bytes and
        #     self.info.size_in_bytes <= 1e9 and
        #     not self.name.startswith("wmt") and
        #     not self.name.startswith("diabetic") and
        #     dataset_shape_is_fully_defined)
        in_memory = in_memory_default if in_memory is None else in_memory

        # Build base dataset
        if in_memory and not wants_full_dataset:
            # TODO(tfds): Enable in_memory without padding features. May be able
            # to do by using a requested version of tf.data.Dataset.cache that can
            # persist a cache beyond iterator instances.
            if not dataset_shape_is_fully_defined:
                logging.warning(
                    "Called in_memory=True on a dataset that does not "
                    "have fully defined shapes. Note that features with "
                    "variable length dimensions will be 0-padded to "
                    "the maximum length across the dataset.")
            full_bs = self.info.splits.total_num_examples or sys.maxsize
            # If using in_memory, escape all device contexts so we can load the data
            # with a local Session.
            with tf.device(None):
                dataset = self._as_dataset(split=split,
                                           shuffle_files=shuffle_files,
                                           decoders=decoders)
                # Use padded_batch so that features with unknown shape are supported.
                dataset = dataset.padded_batch(
                    full_bs, tf.compat.v1.data.get_output_shapes(dataset))
                # Pull the single full-size batch into numpy, then slice it
                # back into individual examples.
                dataset = tf.data.Dataset.from_tensor_slices(
                    next(dataset_utils.as_numpy(dataset)))
        else:
            dataset = self._as_dataset(split=split,
                                       shuffle_files=shuffle_files,
                                       decoders=decoders)

        if batch_size:
            # Use padded_batch so that features with unknown shape are supported.
            dataset = dataset.padded_batch(
                batch_size, tf.compat.v1.data.get_output_shapes(dataset))

        if as_supervised:
            if not self.info.supervised_keys:
                raise ValueError(
                    "as_supervised=True but %s does not support a supervised "
                    "(input, label) structure." % self.name)
            # Keep only the two supervised features, as an (input, target) tuple.
            input_f, target_f = self.info.supervised_keys
            dataset = dataset.map(
                lambda fs: (fs[input_f], fs[target_f]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        # If shuffling, allow pipeline to be non-deterministic
        options = tf.data.Options()
        options.experimental_deterministic = not shuffle_files
        dataset = dataset.with_options(options)

        if wants_full_dataset:
            # The dataset now holds one batch with everything; unwrap it.
            return tf.data.experimental.get_single_element(dataset)
        return dataset
Beispiel #29
0
def get_dataset_feature_statistics(builder, split):
  """Compute per-feature statistics and a schema for one split of a dataset.

  Iterates once over every example of `split`, counting in how many examples
  each top-level feature appears and tracking the min / max value of the
  numeric features. Nested features are reported with an error log and get no
  statistics.

  Args:
    builder: Dataset builder; only `builder.as_dataset(split=...)` is used.
    split: Split to compute the statistics on, forwarded to `as_dataset`.

  Returns:
    A `(statistics, schema)` tuple holding a
    `statistics_pb2.DatasetFeatureStatistics` and a `schema_pb2.Schema`.
  """
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Make this to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Per-feature accumulators, keyed by feature name.
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  np_dataset = dataset_utils.as_numpy(dataset)
  for example in utils.tqdm(np_dataset, unit=" examples", leave=False):
    statistics.num_examples += 1

    assert isinstance(example, dict)

    for feature_name in sorted(example):
      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_np = example[feature_name]

      # For compatibility in graph and eager mode, we can get PODs here and
      # everything may not be neatly wrapped up in numpy's ndarray.
      feature_dtype = type(feature_np)

      if isinstance(feature_np, np.ndarray):
        # If we have an empty array, then don't proceed further with computing
        # statistics on it.
        if feature_np.size == 0:
          continue

        feature_dtype = feature_np.dtype.type

      is_numeric = (np.issubdtype(feature_dtype, np.number) or
                    feature_dtype == np.bool_)
      if not is_numeric:
        continue

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set or update the running min, max of this feature.
      feature_min = np.min(feature_np)
      feature_max = np.max(feature_np)

      if ((feature_name not in feature_to_min) or
          (feature_to_min[feature_name] > feature_min)):
        feature_to_min[feature_name] = feature_min

      if ((feature_name not in feature_to_max) or
          (feature_to_max[feature_name] < feature_max)):
        feature_to_max[feature_name] = feature_max

  # Start here, we've processed all examples.

  # `tf.data.Dataset` objects in TF2 no longer expose `output_shapes` /
  # `output_types` attributes, so go through the compat accessors.
  output_shapes_dict = tf.compat.v1.data.get_output_shapes(dataset)
  output_types_dict = tf.compat.v1.data.get_output_types(dataset)

  for feature_name in sorted(feature_to_num_examples):
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): Make this work with nested structures, currently the Schema
    # proto has no support for it.
    maybe_feature_shape = output_shapes_dict[feature_name]
    if not isinstance(maybe_feature_shape, tf.TensorShape):
      logging.error(
          "Statistics generation doesn't work for nested structures yet")
      continue

    for dim in maybe_feature_shape.as_list():
      # We denote `None`s as -1 in the shape proto. A static dimension of
      # size 0 is a real value and must be kept as is.
      feature.shape.dim.add().size = -1 if dim is None else dim
    feature_type = output_types_dict[feature_name]
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type in (schema_pb2.INT, schema_pb2.FLOAT):
      numeric_statistics = statistics_pb2.NumericStatistics()
      # Uses `.get` as a Sequence(int) containing only empty arrays won't
      # contain any value.
      numeric_statistics.min = feature_to_min.get(feature_name, 0)
      numeric_statistics.max = feature_to_max.get(feature_name, 0)
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
Beispiel #30
0
 def test_in_memory(self):
     """Reading the train split with `in_memory=True` yields 20 examples."""
     in_memory_ds = self.builder.as_dataset(split="train", in_memory=True)
     examples = list(dataset_utils.as_numpy(in_memory_ds))
     self.assertEqual(20, len(examples))