Example #1
def _tokenize_tensor(self, text):
    """Tokenizes a tensor.

    When not overridden, this default implementation calls the string-based
    tokenization.

    Args:
      text: A 0-D string ``tf.Tensor``.

    Returns:
      A 1-D string ``tf.Tensor``.
    """
    if compat.tf_supports("py_function"):

        def _python_wrapper(string_t):
            string = tf.compat.as_text(string_t.numpy())
            tokens = self._tokenize_string(string)
            return tf.constant(tokens)

        tokens = tf.py_function(_python_wrapper, [text], tf.string)
        tokens.set_shape([None])
        return tokens

    text = tf.py_func(
        lambda x: tf.compat.as_bytes("\0".join(self.tokenize(x))),
        [text],
        tf.string)
    tokens = tf.string_split([text], delimiter="\0").values
    return tokens
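Every snippet in this listing branches on compat.tf_supports, OpenNMT-tf's feature-detection helper. As a minimal sketch (an assumption: the project's actual implementation may differ), the check amounts to a dotted attribute lookup under the tf namespace:

import tensorflow as tf

def tf_supports(symbol):
    # Walk a dotted name such as "strings.split" through the tf module;
    # the symbol is "supported" if every attribute in the chain exists.
    obj = tf
    for name in symbol.split("."):
        obj = getattr(obj, name, None)
        if obj is None:
            return False
    return True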
Example #2
def alignment_matrix_from_pharaoh(alignment_line,
                                  source_length,
                                  target_length,
                                  dtype=tf.float32):
    """Parse Pharaoh alignments into an alignment matrix.

    Args:
      alignment_line: A string ``tf.Tensor`` in the Pharaoh format.
      source_length: The length of the source sentence, without special symbols.
      target_length: The length of the target sentence, without special symbols.
      dtype: The output matrix dtype. Defaults to ``tf.float32`` for convenience
        when computing the guided alignment loss.

    Returns:
      The alignment matrix as a 2-D ``tf.Tensor`` of type :obj:`dtype` and shape
      ``[target_length, source_length]``, where ``[i, j] = 1`` if the ``i`` th
      target word is aligned with the ``j`` th source word.
    """
    if compat.tf_supports("strings.split"):
        align_pairs_str = tf.strings.split([alignment_line]).values
        align_pairs_flat_str = tf.strings.split(align_pairs_str,
                                                sep="-").values
    else:
        align_pairs_str = tf.string_split([alignment_line],
                                          delimiter=" ").values
        align_pairs_flat_str = tf.string_split(align_pairs_str,
                                               delimiter="-").values
    align_pairs_flat = compat.tf_compat(v2="strings.to_number",
                                        v1="string_to_number")(
                                            align_pairs_flat_str,
                                            out_type=tf.int64)
    sparse_indices = tf.reshape(align_pairs_flat, [-1, 2])
    sparse_values = tf.ones([tf.shape(sparse_indices)[0]], dtype=dtype)
    source_length = tf.cast(source_length, tf.int64)
    target_length = tf.cast(target_length, tf.int64)
    if compat.tf_supports("sparse.to_dense"):
        alignment_matrix_sparse = tf.sparse.SparseTensor(
            sparse_indices, sparse_values, [source_length, target_length])
        alignment_matrix = tf.sparse.to_dense(alignment_matrix_sparse,
                                              validate_indices=False)
    else:
        alignment_matrix = tf.sparse_to_dense(sparse_indices,
                                              [source_length, target_length],
                                              sparse_values,
                                              validate_indices=False)
    return tf.transpose(alignment_matrix)
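For illustration, a hypothetical call on a short Pharaoh line. Each i-j pair aligns source word i to target word j, which is why the pairs are reshaped into [-1, 2] and the matrix is transposed at the end:

line = tf.constant("0-0 1-1 2-1")
matrix = alignment_matrix_from_pharaoh(line, source_length=3, target_length=2)
# Expected result, shape [target_length, source_length] = [2, 3]:
# [[1., 0., 0.],
#  [0., 1., 1.]]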
Example #3
def encode(self,
           inputs,
           sequence_length=None,
           mode=tf.estimator.ModeKeys.TRAIN):
    outputs = tf.identity(inputs)
    if sequence_length is not None and compat.tf_supports("RaggedTensor"):
        inputs = tf.RaggedTensor.from_tensor(inputs, lengths=sequence_length)
    state = tf.reduce_mean(inputs, axis=1)
    return (outputs, state, sequence_length)
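The RaggedTensor conversion makes the mean ignore padded positions. A small eager demo of the effect (the shapes and values are assumptions for illustration):

import tensorflow as tf

inputs = tf.constant([[[1.0], [3.0], [0.0]]])          # batch of 1, padded to length 3
ragged = tf.RaggedTensor.from_tensor(inputs, lengths=[2])
print(tf.reduce_mean(ragged, axis=1))                  # [[2.0]]: padding excluded
print(tf.reduce_mean(inputs, axis=1))                  # [[1.33...]]: padding included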
Example #4
def _add_mixed_precision_wrapper(optimizer):
    # TODO: clean mixed precision API when TensorFlow requirement is updated to >=2.4.
    wrapper_class = None
    wrapper_kwargs = {}
    if compat.tf_supports("keras.mixed_precision.LossScaleOptimizer"):
        wrapper_class = tf.keras.mixed_precision.LossScaleOptimizer
    else:
        wrapper_class = tf.keras.mixed_precision.experimental.LossScaleOptimizer
        wrapper_kwargs = dict(loss_scale="dynamic")
    if not isinstance(optimizer, wrapper_class):
        optimizer = wrapper_class(optimizer, **wrapper_kwargs)
    return optimizer
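A possible call site, pairing the wrapper with the global mixed precision policy introduced in TF 2.4 (the optimizer choice and learning rate are assumptions for illustration):

tf.keras.mixed_precision.set_global_policy("mixed_float16")
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)
optimizer = _add_mixed_precision_wrapper(optimizer)  # gradients are now loss-scaled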
Example #5
def get_padded_shapes(dataset):
    """Returns the padded shapes for ``tf.data.Dataset.padded_batch``.

    Args:
      dataset: The dataset that will be batched with padding.

    Returns:
      The same structure as ``dataset.output_shapes`` containing the padded
      shapes.
    """
    if compat.tf_supports("data.get_output_shapes"):
        output_shapes = tf.data.get_output_shapes(dataset)
    else:
        output_shapes = dataset.output_shapes
    return compat.nest.map_structure(lambda shape: shape.as_list(),
                                     output_shapes)
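Since as_list() maps unknown dimensions to None, padded_batch pads those dimensions to the longest element in each batch. A hypothetical usage:

# dataset: any tf.data.Dataset of variable-length elements.
dataset = dataset.padded_batch(
    batch_size=32,
    padded_shapes=get_padded_shapes(dataset))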
Example #6
def _detokenize_tensor(self, tokens):
    """Detokenizes tokens.

    When not overridden, this default implementation calls the string-based
    detokenization.

    Args:
      tokens: A 1-D ``tf.Tensor``.

    Returns:
      A 0-D string ``tf.Tensor``.
    """
    if compat.tf_supports("py_function"):

        def _python_wrapper(tokens_t):
            tokens = [tf.compat.as_text(s) for s in tokens_t.numpy()]
            string = self._detokenize_string(tokens)
            return tf.constant(string)

        text = tf.py_function(_python_wrapper, [tokens], tf.string)
        text.set_shape([])
        return text
    return tf.py_func(self.detokenize, [tokens], tf.string)
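The py_function pattern above (an eager Python callback followed by an explicit set_shape, since py_function drops static shape information) works for any string transform. A minimal standalone sketch with a hypothetical lowercase_tensor helper:

import tensorflow as tf

def lowercase_tensor(text):
    def _python_wrapper(string_t):
        return tf.constant(tf.compat.as_text(string_t.numpy()).lower())
    result = tf.py_function(_python_wrapper, [text], tf.string)
    result.set_shape([])  # scalar string in, scalar string out
    return result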
Example #7
def skip_if_unsupported(symbol):
    return unittest.skipIf(
        not compat.tf_supports(symbol),
        "tf.%s is not supported" % symbol)
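A hypothetical test using the decorator; the method is skipped on TensorFlow builds that do not expose RaggedTensor:

class CompatTest(tf.test.TestCase):

    @skip_if_unsupported("RaggedTensor")
    def testRaggedMean(self):
        ragged = tf.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0], [2, 1])
        self.assertAllClose(tf.reduce_mean(ragged, axis=1), [1.5, 3.0])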
Example #8
def _detokenize_tensor(self, tokens):
    if compat.tf_supports("strings.reduce_join"):
        text = tf.strings.reduce_join(tokens, axis=0)
        return tf.strings.regex_replace(text, "▁", " ")
    else:
        return super(CharacterTokenizer, self)._detokenize_tensor(tokens)
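In eager mode, the two steps join the characters back into one string and then restore the "▁" spacer to a regular space:

tokens = tf.constant(["a", "b", "▁", "c"])
text = tf.strings.reduce_join(tokens, axis=0)
print(tf.strings.regex_replace(text, "▁", " "))  # tf.Tensor(b'ab c', ...)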
Example #9
def _tokenize_tensor(self, text):
    if compat.tf_supports("strings.unicode_split"):
        text = tf.strings.regex_replace(text, " ", "▁")
        return tf.strings.unicode_split(text, "UTF-8")
    else:
        return super(CharacterTokenizer, self)._tokenize_tensor(text)
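And the inverse direction: spaces are first replaced with "▁" so they survive the character split:

text = tf.strings.regex_replace(tf.constant("ab c"), " ", "▁")
print(tf.strings.unicode_split(text, "UTF-8"))
# [b'a', b'b', b'\xe2\x96\x81', b'c']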
Example #10
def _tokenize_tensor(self, text):
    if compat.tf_supports("strings.split"):
        return tf.strings.split([text]).values
    else:
        return tf.string_split([text], delimiter=" ").values
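Both branches implement plain whitespace tokenization of a scalar string; in eager TF 2.x, for example:

print(tf.strings.split([tf.constant("Hello world !")]).values)
# tf.Tensor([b'Hello' b'world' b'!'], shape=(3,), dtype=string)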
Example #11
def testTFSupports(self):
    self.assertTrue(compat.tf_supports("data"))
    self.assertTrue(compat.tf_supports("data.Dataset"))
    self.assertFalse(compat.tf_supports("data.UnknownClass"))
    self.assertFalse(compat.tf_supports("unknown_module"))
Example #12
def _group_by_window(*args, **kwargs):
    # TODO: clean this API when TensorFlow requirement is updated to >=2.6.
    if compat.tf_supports("data.Dataset.group_by_window"):
        return lambda dataset: dataset.group_by_window(*args, **kwargs)
    else:
        return tf.data.experimental.group_by_window(*args, **kwargs)
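A hypothetical bucketing transformation built on this helper, grouping examples by length and padding within each bucket (key_func, reduce_func, and the bucket width are assumptions for illustration):

key_func = lambda x: tf.cast(tf.shape(x)[0] // 10, tf.int64)   # bucket id from length
reduce_func = lambda key, window: window.padded_batch(64)
dataset = dataset.apply(_group_by_window(key_func, reduce_func, window_size=64))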