Example #1
0
    def __init__(self, encoder=None, encoder_config=None):
        """Builds a Text FeatureConnector, optionally backed by a text encoder.

        Supply at most one of `encoder` and `encoder_config`; whichever is
        given is used to derive the other.

        Args:
          encoder: `tfds.features.text.TextEncoder`, an encoder that can
            convert text to integers. If None, the text will be utf-8
            byte-encoded.
          encoder_config: `tfds.features.text.TextEncoderConfig`, needed if
            restoring from a file with `load_metadata`.

        Raises:
          ValueError: If both `encoder` and `encoder_config` are provided.
        """
        if encoder:
            if encoder_config:
                raise ValueError(
                    "If encoder is provided, encoder_config must be None.")
            # A live encoder was passed in: record its class and vocabulary
            # size in a config object.
            encoder_config = text_lib.TextEncoderConfig(
                encoder_cls=type(encoder), vocab_size=encoder.vocab_size)
        elif encoder_config:
            # Only a config was passed in: take whatever encoder it carries.
            encoder = encoder_config.encoder

        self._encoder_config = encoder_config
        self._encoder = encoder

        # `self._encoder_cls` is defined outside this method; it is consulted
        # in addition to `encoder` (presumably because a config may carry an
        # encoder class without a live encoder -- confirm against the class).
        if encoder or self._encoder_cls:
            # Encoded text: variable-length sequence of integer ids.
            feature_shape, feature_dtype = (None, ), tf.int64
        else:
            # Plain text: a single scalar string.
            feature_shape, feature_dtype = (), tf.string

        super(Text, self).__init__(shape=feature_shape, dtype=feature_dtype)
Example #2
0
    def __init__(self, encoder=None, encoder_config=None):
        """Builds a Text FeatureConnector, optionally backed by a text encoder.

        Supply at most one of `encoder` and `encoder_config`; whichever is
        given is used to derive the other. Using an encoder (directly or
        through a config) logs a deprecation warning.

        Args:
          encoder: `tfds.features.text.TextEncoder`, an encoder that can
            convert text to integers. If None, the text will be utf-8
            byte-encoded.
          encoder_config: `tfds.features.text.TextEncoderConfig`, needed if
            restoring from a file with `load_metadata`.

        Raises:
          ValueError: If both `encoder` and `encoder_config` are provided.
        """
        if encoder:
            if encoder_config:
                raise ValueError(
                    "If encoder is provided, encoder_config must be None.")
            # A live encoder was passed in: record its class and vocabulary
            # size in a config object.
            encoder_config = text_lib.TextEncoderConfig(
                encoder_cls=type(encoder), vocab_size=encoder.vocab_size)
        elif encoder_config:
            # Only a config was passed in: take whatever encoder it carries.
            encoder = encoder_config.encoder

        self._encoder_config = encoder_config
        self._encoder = encoder

        # `self._encoder_cls` is defined outside this method; it is consulted
        # in addition to `encoder` (presumably because a config may carry an
        # encoder class without a live encoder -- confirm against the class).
        if encoder or self._encoder_cls:
            logging.warning(
                "TFDS datasets with text encoding are deprecated and will be removed "
                "in a future version. Instead, you should use the plain text version "
                "and tokenize the text using `tensorflow_text` (See: "
                "https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)"
            )
            # Encoded text: variable-length sequence of integer ids.
            feature_shape, feature_dtype = (None, ), tf.int64
        else:
            # Plain text: a single scalar string.
            feature_shape, feature_dtype = (), tf.string

        super(Text, self).__init__(shape=feature_shape, dtype=feature_dtype)