def default_hparams(): r"""Returns a dictionary of default hyperparameters. See the specific subclasses for the details. """ hparams = DatasetBase.default_hparams() hparams.update({ "bucket_boundaries": [], "bucket_batch_sizes": None, "bucket_length_fn": None}) return hparams
def _test_data(self, data: DatasetBase, returns_data: bool = False,
               always_returns_data: bool = False):
    sampler = BufferShuffleSampler(data, self.buffer_size)
    for epoch in range(2):
        indices = list(iter(sampler))
        if always_returns_data or (returns_data and epoch == 0):
            # Before the dataset is fully cached, the sampler yields
            # (index, example) pairs. Split them and check that each example
            # matches its index (the test dataset stores the indices
            # themselves as examples).
            examples = [ex[1] for ex in indices]
            indices = [ex[0] for ex in indices]
            np.testing.assert_array_equal(indices, examples)
        # Each epoch must yield a permutation of all indices.
        self.assertEqual(len(set(indices)), self.size)
        self.assertEqual(min(indices), 0)
        self.assertEqual(max(indices), self.size - 1)
        # After the first epoch, mark the dataset as fully cached so
        # subsequent epochs yield plain indices.
        data._fully_cached = True
def default_hparams(): r"""Returns a dictionary of default hyperparameters. .. code-block:: python { # (1) Hyperparams specific to scalar dataset "dataset": { "files": [], "compression_type": None, "data_type": "int", "other_transformations": [], "data_name": "data", } # (2) General hyperparams "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "scalar_data", } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: `"files"`: str or list A (list of) file path(s). Each line contains a single scalar number. `"compression_type"`: str, optional One of "" (no compression), "ZLIB", or "GZIP". `"data_type"`: str The scalar type. Types defined in :meth:`~texar.torch.utils.dtypes.get_supported_scalar_types` are supported. `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. (More documentations to be added.) `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see :meth:`texar.torch.data.DatasetBase.default_hparams` for details. """ hparams = DatasetBase.default_hparams() hparams["name"] = "scalar_data" hparams.update({"dataset": _default_scalar_dataset_hparams()}) return hparams
def default_hparams(): r"""Returns a dictionary of default hyperparameters. .. code-block:: python { # (1) Hyperparameters specific to the record data 'dataset': { 'files': [], 'feature_types': {}, 'feature_convert_types': {}, 'image_options': {}, "num_shards": None, "shard_id": None, "other_transformations": [], "data_name": None, } # (2) General hyperparameters "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "tfrecord_data", } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: `"files"`: str or list A (list of) pickled file path(s). `"feature_types"`: dict The feature names (`str`) with their descriptions in the form of ``feature_name: [dtype, feature_collate_method, shape]``: - ``dtype`` is a Python type (``int``, ``str``), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. The feature will be read from the files and parsed into this dtype. - ``feature_collate_method`` is of type ``str``, and describes how features are collated in the batch. Available values are: - ``"stacked_tensor"``: Features are assumed to be tensors of a fixed shape (or scalars). When collating, features are stacked, with the batch dimension being the first dimension. This is the default value if ``feature_collate_method`` is not specified. For example: - 5 scalar features -> a tensor of shape [5]. - 4 tensor features, each of shape [6, 5] -> a tensor of shape [4, 6, 5]. - ``"padded_tensor"``: Features are assumed to be tensors, with all dimensions except the first having the same size. When collating, features are padded with zero values along the end of the first dimension so that every tensor has the same size, and then stacked, with the batch dimension being the first dimension. For example: - 3 tensor features, with shapes [4, 7, 8], [5, 7, 8], and [4, 7, 8] -> a tensor of shape [3, 5, 7, 8]. - ``"list"``: Features can be any objects. When collating, the features are stored in a Python list. - ``shape`` is optional, and can be of type ``int``, `tuple``, or ``torch.Size``. If specified, shapes of tensor features will be checked, depending on the ``feature_collate_method``: - ``"stacked_tensor"``: The shape of every feature tensor must be ``shape``. - ``"padded_tensor"``: The shape (excluding first dimension) of every feature tensor must be ``shape``. - ``"list"``: ``shape`` is ignored. .. note:: Shape check is performed before any transformations are applied. Example: .. code-block:: python feature_types = { "input_ids": ["int64", "stacked_tensor", 128], "label_ids": ["int64", "stacked_tensor"], "name_lists": ["string", "list"], } .. note:: This field is named `"feature_original_types"` in Texar-TF. This name is still supported, but is deprecated in favor of `"feature_types"`. Texar-TF also uses different names for feature types: - ``"FixedLenFeature"`` corresponds to ``"stacked_tensor"``. - ``"FixedLenSequenceFeature"`` corresponds to ``"padded_tensor"``. - ``"VarLenFeature"`` corresponds to ``"list"``. These names are also accepted in Texar-PyTorch, but are deprecated in favor of the new names. `"feature_convert_types"`: dict, optional Specifies dtype converting after reading the data files. This `dict` maps feature names to desired data dtypes. 
For example, you can first read a feature into dtype ``torch.int32`` by specifying in :attr:`"feature_types"` above, and convert the feature to dtype ``"torch.long"`` by specifying here. Features not specified here will not do dtype-convert. - ``dtype`` is a Python type (`int`, `str`), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. Note that this converting process happens after all the data are restored. Example: .. code-block:: python feature_convert_types = { "input_ids": "int32", "label_ids": "int32", } `"image_options"`: dict, optional Specifies the image feature name and performs image resizing, includes three fields: - `"image_feature_name"`: str The name of the feature which contains the image data. If set, the image data will be restored in a `numpy.ndarray`. - `"resize_height"`: int The height of the image after resizing. - `"resize_width"`: int The width of the image after resizing. If any of :attr:`"resize_height"` or :attr:`"resize_width"` is not set, image data will be restored with original shape. `"num_shards"`: int, optional The number of data shards in distributed mode. Usually set to the number of processes in distributed computing. Used in combination with :attr:`"shard_id"`. .. warning:: Sharding is not yet supported. This option (and related ones below) will be ignored. `"shard_id"`: int, optional Sets the unique id to identify a shard. The module will processes only the corresponding shard of the whole data. Used in combination with :attr:`"num_shards"`. For example, in a case of distributed computing on 2 GPUs, the hyperparameters of the data module for the two processes can be configured as below, respectively. For GPU 0: .. code-block:: python dataset: { ... "num_shards": 2, "shard_id": 0 } For GPU 1: .. code-block:: python dataset: { ... "num_shards": 2, "shard_id": 1 } Also refer to `examples/bert` for a use case. `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see :meth:`texar.torch.data.DatasetBase.default_hparams` for details. """ hparams = DatasetBase.default_hparams() hparams["name"] = "record_data" hparams.update({"dataset": _default_record_dataset_hparams()}) return hparams
def default_hparams(): r"""Returns a dictionary of default hyperparameters. .. code-block:: python { # (1) Hyperparameters specific to the record data 'dataset': { 'files': [], 'feature_original_types': {}, 'feature_convert_types': {}, 'image_options': {}, "num_shards": None, "shard_id": None, "other_transformations": [], "data_name": None, } # (2) General hyperparameters "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "tfrecord_data", } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: `"files"`: str or list A (list of) pickled file path(s). `"feature_original_types"`: dict The feature names (`str`) with their data types and length types, key and value in pair ``feature_name: [dtype, feature_len_type, len]``, - ``dtype`` is a Python type (``int``, ``str``), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. The feature will be read from the files and parsed into this dtype. - ``feature_len_type`` is of type ``str``, and can be either ``"FixedLenFeature"`` or ``"VarLenFeature"`` for fixed length features and non-fixed length features, respectively. - ``len`` is an ``int`` and is optional. It is the length for ``"FixedLenFeature"``. Ignored if ``"VarLenFeature"`` is used. Example: .. code-block:: python feature_original_types = { "input_ids": ["int64", "FixedLenFeature", 128], "label_ids": ["int64", "FixedLenFeature"], "name_lists": ["string", "VarLenFeature"], } `"feature_convert_types"`: dict, optional Specifies dtype converting after reading the data files. This `dict` maps feature names to desired data dtypes. For example, you can first read a feature into dtype ``torch.int32`` by specifying in :attr:`"feature_original_types"` above, and convert the feature to dtype ``"torch.long"`` by specifying here. Features not specified here will not do dtype-convert. - ``dtype`` is a Python type (`int`, `str`), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. Note that this converting process happens after all the data are restored. Example: .. code-block:: python feature_convert_types = { "input_ids": "int32", "label_ids": "int32", } `"image_options"`: dict, optional Specifies the image feature name and performs image resizing, includes three fields: - `"image_feature_name"`: str The name of the feature which contains the image data. If set, the image data will be restored in a `numpy.ndarray`. - `"resize_height"`: int The height of the image after resizing. - `"resize_width"`: int The width of the image after resizing. If any of :attr:`"resize_height"` or :attr:`"resize_width"` is not set, image data will be restored with original shape. `"num_shards"`: int, optional The number of data shards in distributed mode. Usually set to the number of processes in distributed computing. Used in combination with :attr:`"shard_id"`. .. warning:: Sharding is not yet supported. This option (and related ones below) will be ignored. `"shard_id"`: int, optional Sets the unique id to identify a shard. The module will processes only the corresponding shard of the whole data. Used in combination with :attr:`"num_shards"`. 
            For example, in a case of distributed computing on 2 GPUs, the
            hyperparameters of the data module for the two processes can be
            configured as below, respectively.

            For GPU 0:

            .. code-block:: python

                dataset: {
                    ...
                    "num_shards": 2,
                    "shard_id": 0
                }

            For GPU 1:

            .. code-block:: python

                dataset: {
                    ...
                    "num_shards": 2,
                    "shard_id": 1
                }

            Also refer to `examples/bert` for a use case.

        `"other_transformations"`: list
            A list of transformation functions or function names/paths to
            further transform each single data instance.

        `"data_name"`: str
            Name of the dataset.

    2. For the **general** hyperparameters, see
       :meth:`texar.torch.data.DatasetBase.default_hparams` for details.
    """
    hparams = DatasetBase.default_hparams()
    hparams["name"] = "record_data"
    hparams.update({"dataset": _default_record_dataset_hparams()})
    return hparams
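# A comparable sketch using the older "feature_original_types" names shown
# above (Texar-TF style). File and feature names are illustrative, and the
# "bytes" dtype chosen for the raw image feature is an assumption; the
# image_options block is included only to show how the fields fit together.
from texar.torch.data import RecordData, DataIterator

hparams = {
    "dataset": {
        "files": ["data_0.pkl", "data_1.pkl"],   # hypothetical record files
        "feature_original_types": {
            "input_ids": ["int64", "FixedLenFeature", 128],
            "label_ids": ["int64", "FixedLenFeature"],
            "image_raw": ["bytes", "FixedLenFeature"],   # assumed dtype
        },
        "image_options": {
            # Decode "image_raw" into a numpy.ndarray and resize it.
            "image_feature_name": "image_raw",
            "resize_height": 224,
            "resize_width": 224,
        },
    },
    "batch_size": 16,
}
data = RecordData(hparams)
for batch in DataIterator(data):
    ...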