def test_pack_dataset_no_eos(self, use_custom_ops):
  x = [{"inputs": [7, 8, 5], "targets": [3, 9]},
       {"inputs": [8, 4, 9, 3], "targets": [4]}]
  ds = create_default_dataset(x)
  packed_ds = dataset.pack_dataset(
      ds,
      length={"inputs": 8, "targets": 5},
      use_custom_ops=use_custom_ops)
  # Packing still works without the EOS token.
  expected = [{
      "inputs": [7, 8, 5, 8, 4, 9, 3, 0],
      "inputs_segmentation": [1, 1, 1, 2, 2, 2, 2, 0],
      "inputs_position": [0, 1, 2, 0, 1, 2, 3, 0],
      "targets": [3, 9, 4, 0, 0],
      "targets_position": [0, 1, 0, 0, 0],
      "targets_segmentation": [1, 1, 2, 0, 0],
  }]
  self.assert_dataset(packed_ds, expected, {
      "inputs": tf.int32,
      "targets": tf.int32
  })
def test_pack_dataset(self, use_custom_ops):
  x = [{"inputs": [7, 8, 5, 1], "targets": [3, 9, 1], "idx": [0]},
       {"inputs": [8, 4, 9, 3, 1], "targets": [4, 1], "idx": [1]}]
  ds = create_default_dataset(x, feature_names=("inputs", "targets", "idx"))
  packed_ds = dataset.pack_dataset(
      ds,
      length={"inputs": 10, "targets": 7},
      keys=("inputs", "targets"),
      use_custom_ops=use_custom_ops)
  # "idx" is not listed in `keys`, so packing filters it out of the output.
  expected = [{
      "inputs": [7, 8, 5, 1, 8, 4, 9, 3, 1, 0],
      "inputs_segmentation": [1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
      "inputs_position": [0, 1, 2, 3, 0, 1, 2, 3, 4, 0],
      "targets": [3, 9, 1, 4, 1, 0, 0],
      "targets_position": [0, 1, 2, 0, 1, 0, 0],
      "targets_segmentation": [1, 1, 1, 2, 2, 0, 0],
  }]
  self.assert_dataset(packed_ds, expected, {
      "inputs": tf.int32,
      "targets": tf.int32
  })
def pack_or_pad_ll(dataset,
                   length,
                   pack=True,
                   feature_keys=None,
                   ensure_eos=False,
                   shift_decoder_output=False,
                   target_prefix_attributes=None,
                   tokenizer=None):
  """Creates a 'packed' version of a dataset or pads examples with zeros.

  If pack=True, then multiple examples are concatenated to form one combined
  example with the given length. If pack=False, then examples are padded with
  zeros to 'length'.

  Args:
    dataset: a tf.data.Dataset
    length: an integer or a dict from feature-key to integer
    pack: a boolean, whether to pack (True) or pad (False).
    feature_keys: (optional) list of strings, the feature names to limit
      packing or padding to. Packing will filter out other features whereas
      padding will pass them through unchanged. Defaults to all features.
    ensure_eos: a boolean, whether to replace the final token with EOS=1 if it
      is not PAD=0.
    shift_decoder_output: a boolean, whether to shift the decoder output
      features via `shift_decoder_output_fn` before packing/padding, using
      left-pad amounts derived from the tokenized target prefixes.
    target_prefix_attributes: (optional) list of strings, the target prefixes
      whose tokenized lengths (minus one) give the left-pad amounts. Only used
      if shift_decoder_output=True.
    tokenizer: (optional) a tokenizer with an `encode` method, used to tokenize
      the target prefixes. Only used if shift_decoder_output=True.

  Returns:
    a tf.data.Dataset where all features have fixed shape [length].
  """
  feature_keys = feature_keys or list(dataset.output_shapes.keys())
  if shift_decoder_output:
    left_pad_amts = [
        len(tokenizer.encode(target_prefix_attribute)) - 1
        for target_prefix_attribute in target_prefix_attributes
    ]
    dataset = shift_decoder_output_fn(
        dataset, left_pad_amts=left_pad_amts, feature_keys=feature_keys)
  if pack:
    dataset = pack_dataset(dataset, length=length, keys=feature_keys)
  # Pad/trim length of each example to length.
  dataset = trim_and_pad_dataset(
      dataset, length=length, feature_keys=feature_keys)
  if ensure_eos:
    dataset = ensure_dataset_eos_ll(dataset, feature_keys)
  return dataset
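# Illustrative sketch (not part of the original suite): one way
# `pack_or_pad_ll` might be invoked on the packing path, which needs neither
# `tokenizer` nor `target_prefix_attributes`. Assumes the
# `create_default_dataset` helper used by the tests in this module.
def example_pack_or_pad_ll_usage():
  ds = create_default_dataset([{"inputs": [7, 8, 5, 1], "targets": [3, 9, 1]},
                               {"inputs": [8, 4, 1], "targets": [4, 1]}])
  # Pack both short examples into fixed-shape features and replace any
  # non-PAD final token with EOS=1.
  return pack_or_pad_ll(
      ds,
      length={"inputs": 8, "targets": 5},
      pack=True,
      feature_keys=("inputs", "targets"),
      ensure_eos=True)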
def test_pack_dataset_long_seq(self, use_custom_ops):
  x = [{"inputs": [7, 8, 5, 6, 9, 4, 1], "targets": [3, 9, 1]},
       {"inputs": [8, 4, 9, 3, 5, 7, 9, 1], "targets": [4, 1]}]
  ds = create_default_dataset(x)
  packed_ds = dataset.pack_dataset(
      ds,
      length={"inputs": 7, "targets": 3},
      use_custom_ops=use_custom_ops)
  expected = [
      {
          "inputs": [7, 8, 5, 6, 9, 4, 1],
          "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
          "inputs_position": [0, 1, 2, 3, 4, 5, 6],
          "targets": [3, 9, 1],
          "targets_position": [0, 1, 2],
          "targets_segmentation": [1, 1, 1],
      },
      {
          # EOS is trimmed because the inputs exceed the packed length.
          "inputs": [8, 4, 9, 3, 5, 7, 9],
          "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
          "inputs_position": [0, 1, 2, 3, 4, 5, 6],
          "targets": [4, 1, 0],
          "targets_position": [0, 1, 0],
          "targets_segmentation": [1, 1, 0],
      },
  ]
  self.assert_dataset(packed_ds, expected, {
      "inputs": tf.int32,
      "targets": tf.int32
  })
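# Illustrative sketch (not from the original suite): the `*_segmentation`
# features produced by packing are typically consumed downstream to build a
# block-diagonal attention mask, so tokens from different packed examples do
# not attend to each other. The helper name below is hypothetical.
def attention_mask_from_segmentation(segmentation):
  """Returns a [batch, length, length] float mask, 1.0 where attention is allowed."""
  # Two positions may attend to each other only if they carry the same
  # non-zero segment id; segment id 0 marks padding.
  same_segment = tf.equal(segmentation[:, :, None], segmentation[:, None, :])
  non_padding = tf.not_equal(segmentation, 0)[:, :, None]
  return tf.cast(tf.logical_and(same_segment, non_padding), tf.float32)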