def test_pack_dataset_no_eos(self, use_custom_ops):
        # Two examples whose sequences do not end with an EOS token.
        examples = [
            {"inputs": [7, 8, 5], "targets": [3, 9]},
            {"inputs": [8, 4, 9, 3], "targets": [4]},
        ]
        ds = create_default_dataset(examples)
        lengths = {"inputs": 8, "targets": 5}
        packed_ds = dataset.pack_dataset(ds,
                                         length=lengths,
                                         use_custom_ops=use_custom_ops)

        # Packing still works without the eos.
        expected = [{
            "inputs": [7, 8, 5, 8, 4, 9, 3, 0],
            "inputs_segmentation": [1, 1, 1, 2, 2, 2, 2, 0],
            "inputs_position": [0, 1, 2, 0, 1, 2, 3, 0],
            "targets": [3, 9, 4, 0, 0],
            "targets_position": [0, 1, 0, 0, 0],
            "targets_segmentation": [1, 1, 2, 0, 0],
        }]
        expected_dtypes = {"inputs": tf.int32, "targets": tf.int32}
        self.assert_dataset(packed_ds, expected, expected_dtypes)
    def test_pack_dataset(self, use_custom_ops):
        x = [{
            "inputs": [7, 8, 5, 1],
            "targets": [3, 9, 1],
            "idx": [0]
        }, {
            "inputs": [8, 4, 9, 3, 1],
            "targets": [4, 1],
            "idx": [1]
        }]
        ds = create_default_dataset(x,
                                    feature_names=("inputs", "targets", "idx"))
        packed_ds = dataset.pack_dataset(ds,
                                         length={
                                             "inputs": 10,
                                             "targets": 7
                                         },
                                         keys=("inputs", "targets"),
                                         use_custom_ops=use_custom_ops)

        expected = [{
            "inputs": [7, 8, 5, 1, 8, 4, 9, 3, 1, 0],
            "inputs_segmentation": [1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
            "inputs_position": [0, 1, 2, 3, 0, 1, 2, 3, 4, 0],
            "targets": [3, 9, 1, 4, 1, 0, 0],
            "targets_position": [0, 1, 2, 0, 1, 0, 0],
            "targets_segmentation": [1, 1, 1, 2, 2, 0, 0],
        }]
        self.assert_dataset(packed_ds, expected, {
            "inputs": tf.int32,
            "targets": tf.int32
        })
# Beispiel #3
# 0
def pack_or_pad_ll(dataset,
                   length,
                   pack=True,
                   feature_keys=None,
                   ensure_eos=False,
                   shift_decoder_output=False,
                   target_prefix_attributes=None,
                   tokenizer=None):
    """Creates a 'packed' version of a dataset or pads examples with zeros.

    If pack=True, then multiple examples concatenated to form one combined
    example with the given length.
    If pack=False, then examples are padded with zeros to 'length'.

    Args:
      dataset: a tf.data.Dataset
      length: an integer or a dict from feature-key to integer
      pack: a boolean, whether to pack (True) or pad (False).
      feature_keys: (optional) list of strings, the feature names to limit
        packing or padding to. Packing will filter out other features whereas
        padding will pass them through unchanged. Defaults to all features.
      ensure_eos: a boolean, whether to replace the final token with EOS=1 if it
        is not PAD=0.
      shift_decoder_output: a boolean; if True, the dataset is first passed
        through shift_decoder_output_fn before packing/padding.
      target_prefix_attributes: (optional) list of strings; each is encoded
        with `tokenizer` to compute a per-prefix left-pad amount (token count
        minus one). Required when shift_decoder_output=True.
      tokenizer: an object with an `encode(text)` method returning a token
        sequence. Required when shift_decoder_output=True.

    Returns:
      a tf.data.Dataset where all features have fixed shape [length].
    """
    # NOTE(review): `output_shapes` is the TF1-style Dataset accessor —
    # presumably this codebase runs in compat mode; confirm for TF2.
    feature_keys = feature_keys or list(dataset.output_shapes.keys())
    if shift_decoder_output:
        # One left-pad amount per prefix: its encoded token count minus one.
        left_pad_amts = [
            len(tokenizer.encode(target_prefix_attribute)) - 1
            for target_prefix_attribute in target_prefix_attributes
        ]
        dataset = shift_decoder_output_fn(dataset,
                                          left_pad_amts=left_pad_amts,
                                          feature_keys=feature_keys)
    if pack:
        dataset = pack_dataset(dataset, length=length, keys=feature_keys)
    # Pad/trim length of each example to length.
    dataset = trim_and_pad_dataset(dataset,
                                   length=length,
                                   feature_keys=feature_keys)
    if ensure_eos:
        dataset = ensure_dataset_eos_ll(dataset, feature_keys)
    return dataset
 def test_pack_dataset_long_seq(self, use_custom_ops):
     # Sequences that already fill (or exceed) the packed length each get
     # their own output example; over-long inputs are trimmed.
     examples = [
         {"inputs": [7, 8, 5, 6, 9, 4, 1], "targets": [3, 9, 1]},
         {"inputs": [8, 4, 9, 3, 5, 7, 9, 1], "targets": [4, 1]},
     ]
     ds = create_default_dataset(examples)
     packed_ds = dataset.pack_dataset(ds,
                                      length={"inputs": 7, "targets": 3},
                                      use_custom_ops=use_custom_ops)
     expected = [
         {
             "inputs": [7, 8, 5, 6, 9, 4, 1],
             "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
             "inputs_position": [0, 1, 2, 3, 4, 5, 6],
             "targets": [3, 9, 1],
             "targets_position": [0, 1, 2],
             "targets_segmentation": [1, 1, 1],
         },
         {
             # EOS is trimmed
             "inputs": [8, 4, 9, 3, 5, 7, 9],
             "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
             "inputs_position": [0, 1, 2, 3, 4, 5, 6],
             "targets": [4, 1, 0],
             "targets_position": [0, 1, 0],
             "targets_segmentation": [1, 1, 0],
         },
     ]
     expected_dtypes = {"inputs": tf.int32, "targets": tf.int32}
     self.assert_dataset(packed_ds, expected, expected_dtypes)