Ejemplo n.º 1
0
    def __init__(self, hparams):
        TextDataBase.__init__(self, hparams)
        # Defaultizes hparams of each dataset
        datasets_hparams = self._hparams.datasets
        defaultized_datasets_hparams = []
        for ds_hpms in datasets_hparams:
            data_type = ds_hpms.get("data_type", None)
            defaultized_ds_hpms = HParams(ds_hpms,
                                          _default_dataset_hparams(data_type))
            defaultized_datasets_hparams.append(defaultized_ds_hpms)
        self._hparams.datasets = defaultized_datasets_hparams

        with tf.name_scope(self.name, self.default_hparams()["name"]):
            self._make_data()
Ejemplo n.º 2
0
    def default_hparams():
        """Returns a dicitionary of default hyperparameters:

        .. code-block:: python

            {
                # (1) Hyperparams specific to text dataset
                "dataset": {
                    "files": [],
                    "compression_type": None,
                    "vocab_file": "",
                    "embedding_init": {},
                    "delimiter": " ",
                    "max_seq_length": None,
                    "length_filter_mode": "truncate",
                    "pad_to_max_seq_length": False,
                    "bos_token": "<BOS>"
                    "eos_token": "<EOS>"
                    "other_transformations": [],
                    "variable_utterance": False,
                    "utterance_delimiter": "|||",
                    "max_utterance_cnt": 5,
                    "data_name": None,
                }
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "mono_text_data",
                # (3) Bucketing
                "bucket_boundaries": [],
                "bucket_batch_sizes": None,
                "bucket_length_fn": None,
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

            "files": str or list
                A (list of) text file path(s).

                Each line contains a single text sequence.

            "compression_type": str, optional
                One of "" (no compression), "ZLIB", or "GZIP".

            "vocab_file": str
                Path to vocabulary file. Each line of the file should contain
                one vocabulary token.

                Used to create an instance of :class:`~texar.tf.data.Vocab`.

            "embedding_init": dict
                The hyperparameters for pre-trained embedding loading and
                initialization.

                The structure and default values are defined in
                :meth:`texar.tf.data.Embedding.default_hparams`.

            "delimiter": str
                The delimiter to split each line of the text files into tokens.

            "max_seq_length": int, optional
                Maximum length of output sequences. Data samples exceeding the
                length will be truncated or discarded according to
                :attr:`"length_filter_mode"`. The length does not include
                any added
                :attr:`"bos_token"` or :attr:`"eos_token"`. If `None` (default),
                no filtering is performed.

            "length_filter_mode": str
                Either "truncate" or "discard". If "truncate" (default),
                tokens exceeding the :attr:`"max_seq_length"` will be truncated.
                If "discard", data samples longer than the
                :attr:`"max_seq_length"`
                will be discarded.

            "pad_to_max_seq_length": bool
                If `True`, pad all data instances to length
                :attr:`"max_seq_length"`.
                Raises error if :attr:`"max_seq_length"` is not provided.

            "bos_token": str
                The Begin-Of-Sequence token prepended to each sequence.

                Set to an empty string to avoid prepending.

            "eos_token": str
                The End-Of-Sequence token appended to each sequence.

                Set to an empty string to avoid appending.

            "other_transformations": list
                A list of transformation functions or function names/paths to
                further transform each single data instance.

                (More documentations to be added.)

            "variable_utterance": bool
                If `True`, each line of the text file is considered to contain
                multiple sequences (utterances) separated by
                :attr:`"utterance_delimiter"`.

                For example, in dialog data, each line can contain a series of
                dialog history utterances. See the example in
                `examples/hierarchical_dialog` for a use case.

            "utterance_delimiter": str
                The delimiter to split over utterance level. Should not be the
                same with :attr:`"delimiter"`. Used only when
                :attr:`"variable_utterance"``==True`.

            "max_utterance_cnt": int
                Maximally allowed number of utterances in a data instance.
                Extra utterances are truncated out.

            "data_name": str
                Name of the dataset.

        2. For the **general** hyperparameters, see
        :meth:`texar.tf.data.DataBase.default_hparams` for details.

        3. **Bucketing** is to group elements of the dataset together by length
        and then pad and batch. (See more at
        :tf_main:`bucket_by_sequence_length
        <contrib/data/bucket_by_sequence_length>`). For bucketing
        hyperparameters:

            "bucket_boundaries": list
                An int list containing the upper length boundaries of the
                buckets.

                Set to an empty list (default) to disable bucketing.

            "bucket_batch_sizes": list
                An int list containing batch size per bucket. Length should be
                `len(bucket_boundaries) + 1`.

                If `None`, every bucket whill have the same batch size specified
                in :attr:`batch_size`.

            "bucket_length_fn": str or callable
                Function maps dataset element to `tf.int32` scalar, determines
                the length of the element.

                This can be a function, or the name or full module path to the
                function. If function name is given, the function must be in the
                :mod:`texar.tf.custom` module.

                If `None` (default), length is determined by the number of
                tokens (including BOS and EOS if added) of the element.

        """
        hparams = TextDataBase.default_hparams()
        hparams["name"] = "mono_text_data"
        hparams.update({"dataset": _default_mono_text_dataset_hparams()})
        return hparams
Ejemplo n.º 3
0
 def __init__(self, hparams):
     TextDataBase.__init__(self, hparams)
     with tf.name_scope(self.name, self.default_hparams()["name"]):
         self._make_data()
Ejemplo n.º 4
0
    def default_hparams():
        """Returns a dicitionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to text dataset
                "source_dataset": {
                    "files": [],
                    "compression_type": None,
                    "vocab_file": "",
                    "embedding_init": {},
                    "delimiter": " ",
                    "max_seq_length": None,
                    "length_filter_mode": "truncate",
                    "pad_to_max_seq_length": False,
                    "bos_token": None,
                    "eos_token": "<EOS>",
                    "other_transformations": [],
                    "variable_utterance": False,
                    "utterance_delimiter": "|||",
                    "max_utterance_cnt": 5,
                    "data_name": "source",
                },
                "target_dataset": {
                    # ...
                    # Same fields are allowed as in "source_dataset" with the
                    # same default values, except the
                    # following new fields/values:
                    "bos_token": "<BOS>"
                    "vocab_share": False,
                    "embedding_init_share": False,
                    "processing_share": False,
                    "data_name": "target"
                }
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "paired_text_data",
                # (3) Bucketing
                "bucket_boundaries": [],
                "bucket_batch_sizes": None,
                "bucket_length_fn": None,
            }

        Here:

        1. Hyperparameters in the :attr:`"source_dataset"` and
        attr:`"target_dataset"` fields have the same definition as those
        in :meth:`texar.tf.data.MonoTextData.default_hparams`, for source and
        target text, respectively.

        For the new hyperparameters in "target_dataset":

            "vocab_share": bool
                Whether to share the vocabulary of source.
                If `True`, the vocab file of target is ignored.

            "embedding_init_share": bool
                Whether to share the embedding initial value of source. If
                `True`, :attr:`"embedding_init"` of target is ignored.

                :attr:`"vocab_share"` must be true to share the embedding
                initial value.

            "processing_share": bool
                Whether to share the processing configurations of source,
                including
                "delimiter", "bos_token", "eos_token", and
                "other_transformations".

        2. For the **general** hyperparameters, see
        :meth:`texar.tf.data.DataBase.default_hparams` for details.

        3. For **bucketing** hyperparameters, see
        :meth:`texar.tf.data.MonoTextData.default_hparams` for details, except
        that the default bucket_length_fn is the maximum sequence length
        of source and target sequences.

        """
        hparams = TextDataBase.default_hparams()
        hparams["name"] = "paired_text_data"
        hparams.update(_default_paired_text_dataset_hparams())
        return hparams
Ejemplo n.º 5
0
    def default_hparams():
        """Returns a dicitionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to text dataset
                "datasets": []
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "multi_aligned_data",
            }

        Here:

        1. "datasets" is a list of `dict` each of which specifies a
        dataset which can be text, scalar or TFRecord. The
        :attr:`"data_name"` field of each dataset is used as the name
        prefix of the data fields from the respective dataset. The
        :attr:`"data_name"` field of each dataset should not be the same.

            - For scalar dataset, the allowed hyperparameters and default \
            values are the same as the "dataset" field of \
            :meth:`texar.tf.data.ScalarData.default_hparams`. Note that \
            :attr:`"data_type"` must be explicily specified \
            (either "int" or "float"). \

            - For TFRecord dataset, the allowed hyperparameters and default \
            values are the same as the "dataset" field of \
            :meth:`texar.tf.data.TFRecordData.default_hparams`. Note that \
            :attr:`"data_type"` must be explicily specified \
            (tf_record"). \

            - For text dataset, the allowed hyperparameters and default values\
            are the same as the "dataset" filed of \
            :meth:`texar.tf.data.MonoTextData.default_hparams`, with several \
            extra hyperparameters:

                "data_type": str
                    The type of the dataset, one of {"text", "int", "float",
                    "tf_record"}. If set to "int" or "float", the dataset is
                    considered to be a scalar dataset. If set to "tf_record",
                    the dataset is considered to be a TFRecord dataset.
                    If not specified or set to "text", the dataset is
                    considered to be a text dataset.

                "vocab_share_with": int, optional
                    Share the vocabulary of a preceding text dataset with the
                    specified index in the list (starting from 0). The
                    specified dataset must be a text dataset, and must have
                    an index smaller than the current dataset.

                    If specified, the vocab file of current dataset is ignored.
                    Default is `None` which disables the vocab sharing.

                "embedding_init_share_with": int, optional
                    Share the embedding initial value of a preceding text
                    dataset with the specified index in the list (starting
                    from 0).
                    The specified dataset must be a text dataset, and must have
                    an index smaller than the current dataset.

                    If specified, the :attr:`"embedding_init"` field of
                    the current dataset is ignored. Default is `None` which
                    disables the initial value sharing.

                "processing_share_with": int, optional
                    Share the processing configurations of a preceding text
                    dataset with the specified index in the list (starting
                    from 0).
                    The specified dataset must be a text dataset, and must have
                    an index smaller than the current dataset.

                    If specified, relevant field of the current dataset are
                    ignored, including "delimiter", "bos_token", "eos_token",
                    and "other_transformations". Default is `None` which
                    disables the processing sharing.

        2. For the **general** hyperparameters, see
        :meth:`texar.tf.data.DataBase.default_hparams` for details.
        """
        hparams = TextDataBase.default_hparams()
        hparams["name"] = "multi_aligned_data"
        hparams["datasets"] = []
        return hparams