def testApplyPackingErrors(self):
  """Checks that malformed arguments make `ops.apply_packing` fail.

  Every entry pairs the regex expected to match the error message with the
  arguments that should trigger a `tf.errors.InvalidArgumentError`.
  """
  # The trailing ' ' and '.' on the first three patterns are kept exactly as
  # they are: they are part of the regex that is matched against the error.
  failing_cases = [
      ('out of bound',
       ApplyPackingTestCase([[0, 1], [2, 3]], 0, [[1, 1, 1], [1, 0, 0]],
                            [[1, 1, 1], [0, 0, 0]])),
      ('out of bound ',
       ApplyPackingTestCase([[0, 1], [2, 3]], 0, [[1, 1], [1, 0]],
                            [[0, 0], [2, 0]])),
      ('out of bound.',
       ApplyPackingTestCase(['a', 'b'], ',', [[1, 2]], [[1, 2]])),
      ('segment_ids and indices_in_input must be matrices',
       ApplyPackingTestCase([[0, 1], [2, 3]], 0, [1, 1], [0, 0])),
      ('segment_ids and indices_in_input must be matrices of the same shape',
       ApplyPackingTestCase([[0, 1], [2, 3]], 0, [[1, 1], [1, 0]],
                            [[0, 0], [0, 0], [0, 0]])),
      ('input must be a matrix or vector',
       ApplyPackingTestCase([[[0, 1]]], 0, [[1]], [[0]])),
      ('padding must be a scalar',
       ApplyPackingTestCase([[0, 1], [2, 3]], [-1], [[1]], [[0]])),
  ]
  for message_regex, case in failing_cases:
    with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                message_regex), self.session():
      packed = ops.apply_packing(case.input, case.padding, case.segment_ids,
                                 case.indices_in_input)
      packed.eval()
def _Pack(self, batch_in):
  """Returns a packed copy of `batch_in`; the batch size may differ.

  Args:
    batch_in: the batch to pack. Its `segment_ids` are summed per row to get
      the sequence lengths, and its `paddings` are packed separately.

  Returns:
    A transformed deep copy of `batch_in` whose fields went through
    `ops.apply_packing`, with `paddings`, `segment_ids` and `segment_pos`
    overwritten from the packing result.
  """
  seq_lengths = tf.math.reduce_sum(
      tf.cast(batch_in.segment_ids, tf.int32), axis=1)
  # The same lengths are used for both the "src" and "tgt" arguments; only
  # the first three outputs are needed here.
  (segment_ids, segment_pos, indices_in_input, _unused_ids, _unused_pos,
   _unused_indices) = ops.pack_sequences(
       seq_lengths,
       seq_lengths,
       packed_batch_size=0,
       packed_src_seq_len=self.params.max_sequence_length,
       packed_tgt_seq_len=self.params.max_sequence_length)

  # Every field is packed with 0 as the padding value ...
  pack_with_zero = lambda x: ops.apply_packing(x, 0, segment_ids,
                                               indices_in_input)
  packed_batch = batch_in.DeepCopy().Transform(pack_with_zero)
  # ... except paddings, which are re-packed from the input with 1 instead.
  packed_batch.paddings = ops.apply_packing(batch_in.paddings, 1, segment_ids,
                                            indices_in_input)
  packed_batch.segment_ids = tf.cast(segment_ids, tf.float32)
  packed_batch.segment_pos = segment_pos
  return packed_batch
def testApplyPackingStrings(self):
  """Checks `ops.apply_packing` on string inputs.

  In these cases the `padding` argument shows up between the joined strings
  of each expected output row.
  """
  string_cases = {
      'Basic':
          ApplyPackingTestCase(['a', 'b'], ',', [[1, 1]], [[1, 0]],
                               [b'b,a']),
      'Repeated':
          ApplyPackingTestCase(['a', 'b'], ',', [[1, 1, 1]], [[1, 1, 1]],
                               [b'b']),
      'Separator':
          ApplyPackingTestCase(['a', 'b', 'c', 'd'], '=', [[1, 1, 1, 0]],
                               [[1, 0, 3, 2]], [b'b=a=d']),
      'MultiRows':
          ApplyPackingTestCase(['a', 'b', 'c', 'd'], ';',
                               [[1, 1, 1, 0], [0, 1, 1, 1]],
                               [[2, 2, 1, 0], [2, 0, 1, 1]],
                               [b'c;b', b'a;b']),
      'SingleString':
          ApplyPackingTestCase(['a', 'b', 'c', 'd'], ',',
                               [[0, 0, 1], [0, 1, 0]],
                               [[0, 1, 2], [0, 1, 2]], [b'c', b'b']),
      'EmptyRow':
          ApplyPackingTestCase(['a', 'b', 'c', 'd'], ',',
                               [[0, 0, 0], [1, 1, 1]],
                               [[0, 1, 2], [0, 0, 2]], [b'', b'a,c']),
  }
  for label, case in string_cases.items():
    with self.session():
      strings = tf.constant(case.input, tf.string)
      separator = tf.constant(case.padding, tf.string)
      segment_ids = tf.constant(case.segment_ids, tf.int32)
      indices = tf.constant(case.indices_in_input, tf.int32)
      actual = ops.apply_packing(strings, separator, segment_ids,
                                 indices).eval()
      self.assertAllEqual(actual, case.output, label)
def testApplyPackingSum(self):
  """Checks `ops.apply_packing` on vector inputs for several numeric dtypes.

  Each case is run once per dtype; the expected output is cast to the same
  dtype before comparison.
  """
  numeric_dtypes = (tf.int32, tf.int64, tf.float32, tf.float64, tf.uint32,
                    tf.uint64)
  sum_cases = {
      'Basic':
          ApplyPackingTestCase(
              np.arange(10), 0, [[1, 1], [1, 1]], [[1, 1], [5, 5]], [1, 5]),
      'Padding':
          ApplyPackingTestCase(
              np.arange(10), 0, [[1, 1, 0], [1, 1, 0]],
              [[1, 1, 0], [3, 3, 0]], [1, 3]),
      'Tiny':
          ApplyPackingTestCase(
              np.arange(10), 0, [[1], [1]], [[3], [1]], [3, 1]),
      'Larger':
          ApplyPackingTestCase(
              np.arange(10), 0,
              [[1, 1, 2, 2], [0, 1, 2, 3], [0, 1, 1, 1]],
              [[2, 2, 3, 3], [9, 4, 5, 6], [9, 8, 8, 8]], [5, 15, 8]),
  }
  for label, case in sum_cases.items():
    for dtype in numeric_dtypes:
      with self.session():
        values = tf.constant(case.input, dtype)
        pad_value = tf.constant(case.padding, dtype)
        segment_ids = tf.constant(case.segment_ids, tf.int32)
        indices = tf.constant(case.indices_in_input, tf.int32)
        actual = ops.apply_packing(values, pad_value, segment_ids,
                                   indices).eval()
        wanted = tf.constant(case.output, dtype).eval()
        self.assertAllEqual(actual, wanted, f'{label} {dtype}')
def testApplyPackingUnknownShape(self):
  """Checks `ops.apply_packing` on a placeholder with no static shape."""
  x = tf.compat.v1.placeholder(tf.int32, shape=[None, None])
  # Make sure the static shape really is unknown before feeding a value.
  self.assertAllEqual(x.shape.as_list(), [None, None])
  with self.session():
    fed_value = np.array([[0, 1], [2, 3]])
    segment_ids = tf.constant([[1, 1], [1, 1]], tf.int32)
    indices = tf.constant([[1, 1], [0, 0]], tf.int32)
    packed = ops.apply_packing(x, 0, segment_ids, indices)
    result = packed.eval(feed_dict={x: fed_value})
    self.assertAllEqual(result, [[2, 3], [0, 1]])
def testApplyPackingTypes(self):
  """Runs one padded packing case through `ops.apply_packing` per dtype."""
  case = ApplyPackingTestCase(
      [[0, 1], [2, 3]], 99, [[1, 1, 0], [1, 1, 0]], [[1, 1, 0], [0, 0, 0]],
      [[2, 3, 99], [0, 1, 99]])
  numeric_dtypes = [
      tf.int32, tf.int64, tf.float32, tf.float64, tf.uint32, tf.uint64
  ]
  for dtype in numeric_dtypes:
    with self.session():
      values = tf.constant(case.input, dtype)
      pad_value = tf.constant(case.padding, dtype)
      segment_ids = tf.constant(case.segment_ids, tf.int32)
      indices = tf.constant(case.indices_in_input, tf.int32)
      actual = ops.apply_packing(values, pad_value, segment_ids,
                                 indices).eval()
      # The expected output is cast to the dtype under test before comparing.
      wanted = tf.constant(case.output, dtype).eval()
      self.assertAllEqual(actual, wanted, dtype)
def testApplyPacking(self):
  """Checks `ops.apply_packing` on int32 matrices of various shapes."""
  matrix_cases = {
      'Basic':
          ApplyPackingTestCase([[0, 1], [2, 3]], 0, [[1, 1], [1, 1]],
                               [[1, 1], [0, 0]], [[2, 3], [0, 1]]),
      'Padding':
          ApplyPackingTestCase([[0, 1], [2, 3]], -1,
                               [[1, 1, 0], [1, 1, 0]],
                               [[1, 1, 0], [0, 0, 0]],
                               [[2, 3, -1], [0, 1, -1]]),
      'Tiny':
          ApplyPackingTestCase([[0, 1], [2, 3]], 0, [[1], [1]], [[0], [1]],
                               [[0], [2]]),
      '5x2 input to 2x4':
          ApplyPackingTestCase(
              np.arange(10).reshape(5, 2), -1,
              [[1, 1, 2, 2], [0, 1, 0, 2]],
              [[2, 2, 3, 3], [0, 4, 0, 0]],
              [[4, 5, 6, 7], [-1, 8, -1, 0]]),
      '6x4 input to 3x4':
          ApplyPackingTestCase(
              np.arange(24).reshape(6, 4), -1,
              [[0, 0, 0, 1], [1, 1, 1, 1], [1, 2, 3, 4]],
              [[0, 0, 0, 0], [1, 1, 1, 1], [2, 3, 4, 5]],
              [[-1, -1, -1, 0], [4, 5, 6, 7], [8, 12, 16, 20]]),
      '6x4 input to 3x5':
          ApplyPackingTestCase(
              np.arange(24).reshape(6, 4), -1,
              [[0, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 0, 2]],
              [[0, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 0, 3]],
              [[-1, -1, -1, -1, -1], [-1, 4, 5, -1, -1],
               [-1, 0, 1, -1, 12]]),
      '100x4 input to 3x5':
          ApplyPackingTestCase(
              np.arange(400).reshape(100, 4), -1,
              [[1, 1, 1, 2, 2], [0, 1, 1, 1, 1], [2, 2, 0, 0, 3]],
              [[99, 99, 99, 1, 1], [0, 50, 50, 50, 50], [90, 90, 0, 0, 3]],
              [[396, 397, 398, 4, 5], [-1, 200, 201, 202, 203],
               [360, 361, -1, -1, 12]]),
  }
  for label, case in matrix_cases.items():
    with self.session():
      values = tf.constant(case.input, tf.int32)
      pad_value = tf.constant(case.padding, tf.int32)
      segment_ids = tf.constant(case.segment_ids, tf.int32)
      indices = tf.constant(case.indices_in_input, tf.int32)
      actual = ops.apply_packing(values, pad_value, segment_ids,
                                 indices).eval()
      self.assertAllEqual(actual, case.output, label)
def ApplyPacking(x):
  """Packs `x` with `ops.apply_packing`, using 0 as the padding value.

  `segment_ids` and `indices_in_input` are taken from the enclosing scope.
  """
  return ops.apply_packing(
      input=x,
      padding=0,
      segment_ids=segment_ids,
      indices_in_input=indices_in_input)
def ApplyPackingToTarget(x):
  """Packs `x` using the target-side packing results from the outer scope.

  String tensors are packed with '\t' as the padding argument; every other
  dtype is packed with 0.
  """
  pad_value = '\t' if x.dtype == tf.string else 0
  return ops.apply_packing(x, pad_value, tgt_segment_ids,
                           tgt_indices_in_input)
def ApplyPackingToSource(x):
  """Packs `x` using the source-side packing results from the outer scope.

  String tensors are packed with '\t' as the padding argument; every other
  dtype is packed with 0.
  """
  pad_value = '\t' if x.dtype == tf.string else 0
  return ops.apply_packing(x, pad_value, src_segment_ids,
                           src_indices_in_input)
def _ApplyPacking(self, batch):
  """Packs `batch.src` and `batch.tgt` in place and emits packing summaries.

  After this call both `batch.src` and `batch.tgt` carry `.segment_ids` and
  `.segment_pos`. When `self.params.packing_factor` is falsy nothing is
  packed and those fields are derived from `.ids_indicator` instead. Packing
  may change the batch size.

  Args:
    batch: a `.NestedMap` of input tensors with `src` and `tgt` entries. It
      is modified in place.
  """

  def _SeqLengths(indicator):
    # Per-row count of set positions in an ids indicator.
    return tf.math.reduce_sum(tf.cast(indicator, tf.int32), axis=1)

  def _PackedTokenRatio(segment_ids, seq_lengths):
    # Tokens kept after packing divided by tokens present before packing.
    before = tf.cast(tf.reduce_sum(seq_lengths), tf.float32)
    after = tf.reduce_sum(tf.cast(segment_ids > 0, tf.float32))
    return after / before

  def _FieldPacker(segment_ids, indices_in_input):
    # Builds the per-field function handed to Transform(): strings are packed
    # with '\t' as padding, everything else with 0.
    def _PackField(x):
      pad_value = '\t' if x.dtype == tf.string else 0
      return ops.apply_packing(x, pad_value, segment_ids, indices_in_input)

    return _PackField

  src_lengths = _SeqLengths(batch.src.ids_indicator)
  tgt_lengths = _SeqLengths(batch.tgt.ids_indicator)
  summary_utils.histogram('source_seq_lengths', src_lengths)
  summary_utils.histogram('target_seq_lengths', tgt_lengths)

  if not self.params.packing_factor:
    # No packing requested: still supply segment_ids and segment_pos.
    for side in (batch.src, batch.tgt):
      side.segment_ids = side.ids_indicator
      side.segment_pos = _GetSegmentPos(side.ids_indicator)
    return

  packing = ops.pack_sequences(src_lengths, tgt_lengths,
                               self._ScaledBatchSize(),
                               self.params.source_max_length,
                               self.params.target_max_length)
  (src_segment_ids, src_segment_pos, src_indices_in_input, tgt_segment_ids,
   tgt_segment_pos, tgt_indices_in_input) = packing

  # Histogram the lengths of the input rows that are referenced by the
  # packing result, counting each referenced row once.
  src_rows_used = tf.unique(tf.reshape(src_indices_in_input, [-1])).y
  tgt_rows_used = tf.unique(tf.reshape(tgt_indices_in_input, [-1])).y
  summary_utils.histogram('packed_source_seq_lengths',
                          tf.gather(src_lengths, src_rows_used, axis=0))
  summary_utils.histogram('packed_target_seq_lengths',
                          tf.gather(tgt_lengths, tgt_rows_used, axis=0))

  # Ratio of non-padded tokens after vs. before packing. A value below 1.0
  # means input data is being dropped because p.packing_factor is too high.
  summary_utils.scalar('examples/src_packed_token_ratio',
                       _PackedTokenRatio(src_segment_ids, src_lengths))
  summary_utils.scalar('examples/tgt_packed_token_ratio',
                       _PackedTokenRatio(tgt_segment_ids, tgt_lengths))

  # Transform() packs every field with 0 (or '\t') as padding, so paddings
  # are packed separately with 1 first and then written over the result.
  packed_src_paddings = ops.apply_packing(batch.src.paddings, 1,
                                          src_segment_ids,
                                          src_indices_in_input)
  batch.src = batch.src.Transform(
      _FieldPacker(src_segment_ids, src_indices_in_input))
  batch.src.paddings = packed_src_paddings
  batch.src.segment_ids = tf.cast(src_segment_ids, tf.float32)
  batch.src.segment_pos = src_segment_pos

  packed_tgt_paddings = ops.apply_packing(batch.tgt.paddings, 1,
                                          tgt_segment_ids,
                                          tgt_indices_in_input)
  batch.tgt = batch.tgt.Transform(
      _FieldPacker(tgt_segment_ids, tgt_indices_in_input))
  batch.tgt.paddings = packed_tgt_paddings
  batch.tgt.segment_ids = tf.cast(tgt_segment_ids, tf.float32)
  batch.tgt.segment_pos = tgt_segment_pos

  # The number of examples is taken from the target segment ids: the largest
  # id in each packed row, summed over rows. Note that this is a per-infeed
  # value when p.use_per_host_infeed = True.
  segments_per_row = tf.math.reduce_max(batch.tgt.segment_ids, axis=1)
  summary_utils.scalar('examples/num_packed_examples',
                       tf.reduce_sum(segments_per_row))
def _Pack(self, batch):
  """Packs `batch.src` and `batch.tgt` in place.

  After this call both `batch.src` and `batch.tgt` carry `.segment_ids` and
  `.segment_pos`. When `self.params.packing_factor` is falsy nothing is
  packed and those fields are derived from `.ids_indicator` instead. Packing
  may change the batch size.

  Args:
    batch: a `.NestedMap` of input tensors with `src` and `tgt` entries. It
      is modified in place.
  """

  def _SeqLengths(indicator):
    # Per-row count of set positions in an ids indicator.
    return tf.math.reduce_sum(tf.cast(indicator, tf.int32), axis=1)

  def _FieldPacker(segment_ids, indices_in_input):
    # Builds the per-field function handed to Transform(): strings are packed
    # with '\t' as padding, everything else with 0.
    def _PackField(x):
      pad_value = '\t' if x.dtype == tf.string else 0
      return ops.apply_packing(x, pad_value, segment_ids, indices_in_input)

    return _PackField

  src_lengths = _SeqLengths(batch.src.ids_indicator)
  tgt_lengths = _SeqLengths(batch.tgt.ids_indicator)
  summary_utils.histogram('source_seq_lengths', src_lengths)
  summary_utils.histogram('target_seq_lengths', tgt_lengths)

  if not self.params.packing_factor:
    # No packing requested: still supply segment_ids and segment_pos.
    for side in (batch.src, batch.tgt):
      side.segment_ids = side.ids_indicator
      side.segment_pos = _GetSegmentPos(side.ids_indicator)
    return

  packing = ops.pack_sequences(src_lengths, tgt_lengths,
                               self._ScaledBatchSize(),
                               self.params.source_max_length,
                               self.params.target_max_length)
  (src_segment_ids, src_segment_pos, src_indices_in_input, tgt_segment_ids,
   tgt_segment_pos, tgt_indices_in_input) = packing

  # Histogram the lengths of the input rows that are referenced by the
  # packing result, counting each referenced row once.
  src_rows_used = tf.unique(tf.reshape(src_indices_in_input, [-1])).y
  tgt_rows_used = tf.unique(tf.reshape(tgt_indices_in_input, [-1])).y
  summary_utils.histogram('packed_source_seq_lengths',
                          tf.gather(src_lengths, src_rows_used, axis=0))
  summary_utils.histogram('packed_target_seq_lengths',
                          tf.gather(tgt_lengths, tgt_rows_used, axis=0))

  # Transform() packs every field with 0 (or '\t') as padding, so paddings
  # are packed separately with 1 first and then written over the result.
  packed_src_paddings = ops.apply_packing(batch.src.paddings, 1,
                                          src_segment_ids,
                                          src_indices_in_input)
  batch.src = batch.src.Transform(
      _FieldPacker(src_segment_ids, src_indices_in_input))
  batch.src.paddings = packed_src_paddings
  batch.src.segment_ids = tf.cast(src_segment_ids, tf.float32)
  batch.src.segment_pos = src_segment_pos

  packed_tgt_paddings = ops.apply_packing(batch.tgt.paddings, 1,
                                          tgt_segment_ids,
                                          tgt_indices_in_input)
  batch.tgt = batch.tgt.Transform(
      _FieldPacker(tgt_segment_ids, tgt_indices_in_input))
  batch.tgt.paddings = packed_tgt_paddings
  batch.tgt.segment_ids = tf.cast(tgt_segment_ids, tf.float32)
  batch.tgt.segment_pos = tgt_segment_pos
def testApplyPackingErrors(self, expected_error_type, expected_error, test):
  """Checks that `ops.apply_packing` rejects the arguments held by `test`.

  Args:
    expected_error_type: exception class that evaluating the op must raise.
    expected_error: regex that the exception message must match.
    test: case providing `input`, `padding`, `segment_ids` and
      `indices_in_input` for the op.
  """
  with self.assertRaisesRegex(expected_error_type,
                              expected_error), self.session():
    packed = ops.apply_packing(test.input, test.padding, test.segment_ids,
                               test.indices_in_input)
    packed.eval()
def testPackSingleSequence(self, input_lengths, max_packed_length,
                           require_sequential_order, expected_packed_idxs):
  """Checks `ops.pack_single_sequence` and its use with `apply_packing`.

  Args:
    input_lengths: length of every input sequence.
    max_packed_length: length every packed row is padded to.
    require_sequential_order: forwarded unchanged to the op.
    expected_packed_idxs: for each packed row, the input indices expected in
      it, in order.
  """
  with self.session() as sess:
    np.random.seed(12345)
    packing_op = ops.pack_single_sequence(
        input_lengths=input_lengths,
        max_packed_length=max_packed_length,
        require_sequential_order=require_sequential_order)
    segment_ids, indices_in_input = sess.run(packing_op)
    self.assertLen(expected_packed_idxs, segment_ids.shape[0])

    # Feed random data through apply_packing to check that the packing
    # result is compatible with it.
    inputs = [
        np.random.randint(100000, size=[length, 2, 2], dtype=np.int32)
        for length in input_lengths
    ]
    padded_inputs = [
        tf.pad(x, [[0, max_packed_length - x.shape[0]], [0, 0], [0, 0]])
        for x in inputs
    ]
    outputs = sess.run(
        ops.apply_packing(
            input=tf.stack(padded_inputs),
            padding=0,
            segment_ids=segment_ids,
            indices_in_input=indices_in_input))

    rows = zip(segment_ids, indices_in_input, outputs, expected_packed_idxs)
    for row_segment_ids, row_idxs, row_output, row_expected_idxs in rows:
      # Rebuild the expected row contents from the expected input indices.
      want_segment_ids = []
      want_idxs = []
      want_chunks = []
      for position, input_idx in enumerate(row_expected_idxs):
        length = input_lengths[input_idx]
        want_segment_ids.extend([position + 1] * length)
        want_idxs.extend([input_idx] * length)
        want_chunks.append(inputs[input_idx])
      want_output = np.concatenate(want_chunks)
      used_length = len(want_output)
      self.assertLessEqual(used_length, max_packed_length)
      self.assertLen(want_segment_ids, used_length)
      self.assertLen(want_idxs, used_length)

      # indices_in_input must be non-decreasing over the used prefix.
      if used_length > 1:
        self.assertAllGreaterEqual(np.diff(row_idxs[:used_length]), 0)

      # Pad the expectations out to max_packed_length.
      unused = max_packed_length - used_length
      want_segment_ids.extend([0] * unused)
      want_idxs.extend([-1] * unused)
      want_output = np.pad(
          want_output, [(0, unused), (0, 0), (0, 0)], mode='constant')

      self.assertAllEqual(want_idxs, row_idxs)
      self.assertAllEqual(want_segment_ids, row_segment_ids)
      self.assertAllEqual(want_output, row_output)