Beispiel #1
0
 def testApplyPackingErrors(self):
     """Checks that malformed apply_packing arguments raise InvalidArgumentError.

     Every dictionary key is used as the regular expression that the raised
     error message has to match.
     NOTE(review): the 'out of bound ' and 'out of bound.' variants presumably
     exist only to keep the dictionary keys unique -- confirm.
     """
     error_cases = {
         'out of bound':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], 0,
             [[1, 1, 1], [1, 0, 0]],
             [[1, 1, 1], [0, 0, 0]]),
         'out of bound ':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], 0,
             [[1, 1], [1, 0]],
             [[0, 0], [2, 0]]),
         'out of bound.':
         ApplyPackingTestCase(['a', 'b'], ',', [[1, 2]], [[1, 2]]),
         'segment_ids and indices_in_input must be matrices':
         ApplyPackingTestCase([[0, 1], [2, 3]], 0, [1, 1], [0, 0]),
         'segment_ids and indices_in_input must be matrices of the same shape':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], 0,
             [[1, 1], [1, 0]],
             [[0, 0], [0, 0], [0, 0]]),
         'input must be a matrix or vector':
         ApplyPackingTestCase([[[0, 1]]], 0, [[1]], [[0]]),
         'padding must be a scalar':
         ApplyPackingTestCase([[0, 1], [2, 3]], [-1], [[1]], [[0]]),
     }
     for error_regex, case in error_cases.items():
         # The assertion context wraps the session so that an error raised
         # while building or evaluating the op is captured either way.
         with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                     error_regex), self.session():
             packed = ops.apply_packing(case.input, case.padding,
                                        case.segment_ids,
                                        case.indices_in_input)
             packed.eval()
Beispiel #2
0
    def _Pack(self, batch_in):
        """Returns a packed version of `batch_in`; the batch size may change.

        Row lengths are the sums of `batch_in.segment_ids` along axis 1 and
        are passed to `ops.pack_sequences` as both of its length arguments.
        All fields are packed with padding value 0, then `paddings` is
        recomputed from the input with padding value 1, and `segment_ids`
        (cast to float32) and `segment_pos` are replaced by the packer's
        outputs.

        Args:
          batch_in: the batch to pack. It must expose `segment_ids`,
            `paddings`, `DeepCopy()` and `Transform()`.

        Returns:
          The packed batch.
        """
        seq_lengths = tf.math.reduce_sum(
            tf.cast(batch_in.segment_ids, tf.int32), axis=1)
        max_len = self.params.max_sequence_length

        # Only the first three outputs are needed here; the remaining three
        # are unpacked solely to keep the six-way unpacking.
        (segment_ids, segment_pos, indices_in_input, _unused_a, _unused_b,
         _unused_c) = ops.pack_sequences(
             seq_lengths,
             seq_lengths,
             packed_batch_size=0,
             packed_src_seq_len=max_len,
             packed_tgt_seq_len=max_len)

        def _PackField(field):
            return ops.apply_packing(field, 0, segment_ids, indices_in_input)

        batch_out = batch_in.DeepCopy().Transform(_PackField)
        # Overwrite the paddings packed above (padding value 0) with a version
        # packed from the original input using padding value 1.
        batch_out.paddings = ops.apply_packing(
            batch_in.paddings, 1, segment_ids, indices_in_input)
        batch_out.segment_ids = tf.cast(segment_ids, tf.float32)
        batch_out.segment_pos = segment_pos
        return batch_out
Beispiel #3
0
 def testApplyPackingStrings(self):
     """Checks apply_packing on string inputs.

     The second ApplyPackingTestCase argument is fed as the string `padding`
     tensor. Judging by the expected values below, the strings selected for a
     row are joined with that string (e.g. ['a', 'b'] with indices [1, 0]
     yields b'b,a').
     """
     string_cases = {
         'Basic':
         ApplyPackingTestCase(
             ['a', 'b'], ',', [[1, 1]], [[1, 0]], [b'b,a']),
         'Repeated':
         ApplyPackingTestCase(
             ['a', 'b'], ',', [[1, 1, 1]], [[1, 1, 1]], [b'b']),
         'Separator':
         ApplyPackingTestCase(
             ['a', 'b', 'c', 'd'], '=',
             [[1, 1, 1, 0]],
             [[1, 0, 3, 2]],
             [b'b=a=d']),
         'MultiRows':
         ApplyPackingTestCase(
             ['a', 'b', 'c', 'd'], ';',
             [[1, 1, 1, 0], [0, 1, 1, 1]],
             [[2, 2, 1, 0], [2, 0, 1, 1]],
             [b'c;b', b'a;b']),
         'SingleString':
         ApplyPackingTestCase(
             ['a', 'b', 'c', 'd'], ',',
             [[0, 0, 1], [0, 1, 0]],
             [[0, 1, 2], [0, 1, 2]],
             [b'c', b'b']),
         'EmptyRow':
         ApplyPackingTestCase(
             ['a', 'b', 'c', 'd'], ',',
             [[0, 0, 0], [1, 1, 1]],
             [[0, 1, 2], [0, 0, 2]],
             [b'', b'a,c']),
     }
     for label, case in string_cases.items():
         with self.session():
             strings = tf.constant(case.input, tf.string)
             separator = tf.constant(case.padding, tf.string)
             segment_ids = tf.constant(case.segment_ids, tf.int32)
             indices = tf.constant(case.indices_in_input, tf.int32)
             packed = ops.apply_packing(strings, separator, segment_ids,
                                        indices).eval()
             self.assertAllEqual(packed, case.output, label)
Beispiel #4
0
 def testApplyPackingSum(self):
     """Checks apply_packing on a vector input for several numeric dtypes.

     Judging by the expected values, each output element is the sum of the
     input entries selected by the segments of that row (e.g. in 'Larger' the
     first row selects entries 2 and 3 and yields 5).
     """
     numeric_dtypes = (tf.int32, tf.int64, tf.float32, tf.float64, tf.uint32,
                       tf.uint64)
     sum_cases = {
         'Basic':
         ApplyPackingTestCase(
             np.arange(10), 0,
             [[1, 1], [1, 1]],
             [[1, 1], [5, 5]],
             [1, 5]),
         'Padding':
         ApplyPackingTestCase(
             np.arange(10), 0,
             [[1, 1, 0], [1, 1, 0]],
             [[1, 1, 0], [3, 3, 0]],
             [1, 3]),
         'Tiny':
         ApplyPackingTestCase(
             np.arange(10), 0, [[1], [1]], [[3], [1]], [3, 1]),
         'Larger':
         ApplyPackingTestCase(
             np.arange(10), 0,
             [[1, 1, 2, 2], [0, 1, 2, 3], [0, 1, 1, 1]],
             [[2, 2, 3, 3], [9, 4, 5, 6], [9, 8, 8, 8]],
             [5, 15, 8]),
     }
     for name, case in sum_cases.items():
         for dtype in numeric_dtypes:
             with self.session():
                 values = tf.constant(case.input, dtype)
                 pad_value = tf.constant(case.padding, dtype)
                 segment_ids = tf.constant(case.segment_ids, tf.int32)
                 indices = tf.constant(case.indices_in_input, tf.int32)
                 output = ops.apply_packing(values, pad_value, segment_ids,
                                            indices).eval()
                 # Build the expectation in the same dtype as the input.
                 expected = tf.constant(case.output, dtype).eval()
                 self.assertAllEqual(output, expected, f'{name} {dtype}')
Beispiel #5
0
 def testApplyPackingUnknownShape(self):
   """Runs apply_packing on a placeholder whose static shape is [None, None]."""
   unknown_shape_input = tf.compat.v1.placeholder(
       tf.int32, shape=[None, None])
   self.assertAllEqual(unknown_shape_input.shape.as_list(), [None, None])
   with self.session():
     fed_value = np.array([[0, 1], [2, 3]])
     segment_ids = tf.constant([[1, 1], [1, 1]], tf.int32)
     indices_in_input = tf.constant([[1, 1], [0, 0]], tf.int32)
     packed = ops.apply_packing(unknown_shape_input, 0, segment_ids,
                                indices_in_input)
     # The concrete 2x2 value is only supplied at evaluation time.
     output = packed.eval(feed_dict={unknown_shape_input: fed_value})
   self.assertAllEqual(output, [[2, 3], [0, 1]])
Beispiel #6
0
 def testApplyPackingTypes(self):
   """Checks one apply_packing case for each of six numeric dtypes.

   The same 2x2 input is packed into a 2x3 output with padding value 99 for
   int32, int64, float32, float64, uint32 and uint64. Input, padding and the
   expected output are all built in the dtype under test.
   """
   test = ApplyPackingTestCase([[0, 1], [2, 3]], 99, [[1, 1, 0], [1, 1, 0]],
                               [[1, 1, 0], [0, 0, 0]],
                               [[2, 3, 99], [0, 1, 99]])
   for dtype in [
       tf.int32, tf.int64, tf.float32, tf.float64, tf.uint32, tf.uint64
   ]:
     with self.session():
       output = ops.apply_packing(
           tf.constant(test.input, dtype), tf.constant(test.padding, dtype),
           tf.constant(test.segment_ids, tf.int32),
           tf.constant(test.indices_in_input, tf.int32)).eval()
       expected = tf.constant(test.output, dtype).eval()
       # Pass the failure message as a string rather than a raw DType object
       # so it can always be rendered when a mismatch is reported.
       self.assertAllEqual(output, expected, str(dtype))
Beispiel #7
0
 def testApplyPacking(self):
     """Checks apply_packing on int32 matrices of various shapes.

     Each case lists input, padding value, segment_ids, indices_in_input and
     the expected packed output. In the expectations, positions whose segment
     id is 0 hold the padding value.
     """
     matrix_cases = {
         'Basic':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], 0,
             [[1, 1], [1, 1]],
             [[1, 1], [0, 0]],
             [[2, 3], [0, 1]]),
         'Padding':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], -1,
             [[1, 1, 0], [1, 1, 0]],
             [[1, 1, 0], [0, 0, 0]],
             [[2, 3, -1], [0, 1, -1]]),
         'Tiny':
         ApplyPackingTestCase(
             [[0, 1], [2, 3]], 0, [[1], [1]], [[0], [1]], [[0], [2]]),
         '5x2 input to 2x4':
         ApplyPackingTestCase(
             np.reshape(np.arange(10), [5, 2]), -1,
             [[1, 1, 2, 2], [0, 1, 0, 2]],
             [[2, 2, 3, 3], [0, 4, 0, 0]],
             [[4, 5, 6, 7], [-1, 8, -1, 0]]),
         '6x4 input to 3x4':
         ApplyPackingTestCase(
             np.reshape(np.arange(24), [6, 4]), -1,
             [[0, 0, 0, 1], [1, 1, 1, 1], [1, 2, 3, 4]],
             [[0, 0, 0, 0], [1, 1, 1, 1], [2, 3, 4, 5]],
             [[-1, -1, -1, 0], [4, 5, 6, 7], [8, 12, 16, 20]]),
         '6x4 input to 3x5':
         ApplyPackingTestCase(
             np.reshape(np.arange(24), [6, 4]), -1,
             [[0, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 0, 2]],
             [[0, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 0, 3]],
             [[-1, -1, -1, -1, -1],
              [-1, 4, 5, -1, -1],
              [-1, 0, 1, -1, 12]]),
         '100x4 input to 3x5':
         ApplyPackingTestCase(
             np.reshape(np.arange(400), [100, 4]), -1,
             [[1, 1, 1, 2, 2], [0, 1, 1, 1, 1], [2, 2, 0, 0, 3]],
             [[99, 99, 99, 1, 1], [0, 50, 50, 50, 50], [90, 90, 0, 0, 3]],
             [[396, 397, 398, 4, 5],
              [-1, 200, 201, 202, 203],
              [360, 361, -1, -1, 12]]),
     }
     for label, case in matrix_cases.items():
         with self.session():
             values = tf.constant(case.input, tf.int32)
             pad_value = tf.constant(case.padding, tf.int32)
             segment_ids = tf.constant(case.segment_ids, tf.int32)
             indices = tf.constant(case.indices_in_input, tf.int32)
             packed = ops.apply_packing(values, pad_value, segment_ids,
                                        indices).eval()
             self.assertAllEqual(packed, case.output, label)
Beispiel #8
0
 def ApplyPacking(x):
     """Packs `x` with padding value 0.

     `segment_ids` and `indices_in_input` are free variables that must be
     defined outside this function.
     """
     return ops.apply_packing(input=x,
                              padding=0,
                              segment_ids=segment_ids,
                              indices_in_input=indices_in_input)
Beispiel #9
0
 def ApplyPackingToTarget(x):
     """Packs `x` using the target-side packing variables.

     String tensors are packed with '\\t' as the padding value and all other
     tensors with 0. `tgt_segment_ids` and `tgt_indices_in_input` are free
     variables that must be defined outside this function.
     """
     pad_value = '\t' if x.dtype == tf.string else 0
     return ops.apply_packing(x, pad_value, tgt_segment_ids,
                              tgt_indices_in_input)
Beispiel #10
0
 def ApplyPackingToSource(x):
     """Packs `x` using the source-side packing variables.

     String tensors are packed with '\\t' as the padding value and all other
     tensors with 0. `src_segment_ids` and `src_indices_in_input` are free
     variables that must be defined outside this function.
     """
     pad_value = '\t' if x.dtype == tf.string else 0
     return ops.apply_packing(x, pad_value, src_segment_ids,
                              src_indices_in_input)
Beispiel #11
0
    def _ApplyPacking(self, batch):
        """Packs `batch.src` and `batch.tgt` and attaches segment information.

        Note that this may change the batch size.

        Sequence lengths are the row sums of `ids_indicator`. When
        `self.params.packing_factor` is falsy nothing is packed: on both sides
        `segment_ids` is set to `ids_indicator` and `segment_pos` to
        `_GetSegmentPos(ids_indicator)`. Otherwise `ops.pack_sequences`
        decides the layout, every field of `src` and `tgt` is rearranged with
        `ops.apply_packing`, and `.paddings`, `.segment_ids` (as float32) and
        `.segment_pos` are set on both sides. Length histograms, packed-token
        ratios and the number of packed examples are reported through
        `summary_utils`.

        Args:
          batch: a `.NestedMap` of input tensors to be packed. It is modified
            in place.
        """

        def _RowLengths(indicator):
            return tf.math.reduce_sum(tf.cast(indicator, tf.int32), axis=1)

        src_lengths = _RowLengths(batch.src.ids_indicator)
        tgt_lengths = _RowLengths(batch.tgt.ids_indicator)
        summary_utils.histogram('source_seq_lengths', src_lengths)
        summary_utils.histogram('target_seq_lengths', tgt_lengths)

        if not self.params.packing_factor:
            # Supply segment_ids and segment_pos with no packing.
            for side in (batch.src, batch.tgt):
                side.segment_ids = side.ids_indicator
                side.segment_pos = _GetSegmentPos(side.ids_indicator)
            return

        packing = ops.pack_sequences(
            src_lengths, tgt_lengths, self._ScaledBatchSize(),
            self.params.source_max_length, self.params.target_max_length)
        (src_segment_ids, src_segment_pos, src_indices, tgt_segment_ids,
         tgt_segment_pos, tgt_indices) = packing

        def _DistinctIndices(indices_in_input):
            # Distinct values that occur anywhere in indices_in_input.
            return tf.unique(tf.reshape(indices_in_input, [-1])).y

        distinct_src = _DistinctIndices(src_indices)
        distinct_tgt = _DistinctIndices(tgt_indices)
        summary_utils.histogram(
            'packed_source_seq_lengths',
            tf.gather(src_lengths, distinct_src, axis=0))
        summary_utils.histogram(
            'packed_target_seq_lengths',
            tf.gather(tgt_lengths, distinct_tgt, axis=0))

        def _PackedTokenRatio(lengths, segment_ids):
            # Fraction of the original tokens that received a positive
            # segment id. A value below 1.0 means input data is being dropped
            # because p.packing_factor is too high.
            original_count = tf.cast(tf.reduce_sum(lengths), tf.float32)
            packed_count = tf.reduce_sum(
                tf.cast(segment_ids > 0, tf.float32))
            return packed_count / original_count

        summary_utils.scalar('examples/src_packed_token_ratio',
                             _PackedTokenRatio(src_lengths, src_segment_ids))
        summary_utils.scalar('examples/tgt_packed_token_ratio',
                             _PackedTokenRatio(tgt_lengths, tgt_segment_ids))

        def _PackSide(side, segment_ids, segment_pos, indices_in_input):
            """Returns `side` packed according to the given layout."""

            def _PackField(x):
                # String fields use '\t' as padding; everything else uses 0.
                pad_value = '\t' if x.dtype == tf.string else 0
                return ops.apply_packing(x, pad_value, segment_ids,
                                         indices_in_input)

            # Paddings are packed separately with padding value 1, before the
            # generic transform, and then override the transformed field.
            paddings = ops.apply_packing(side.paddings, 1, segment_ids,
                                         indices_in_input)
            packed = side.Transform(_PackField)
            packed.paddings = paddings
            packed.segment_ids = tf.cast(segment_ids, tf.float32)
            packed.segment_pos = segment_pos
            return packed

        batch.src = _PackSide(batch.src, src_segment_ids, src_segment_pos,
                              src_indices)
        batch.tgt = _PackSide(batch.tgt, tgt_segment_ids, tgt_segment_pos,
                              tgt_indices)

        # The number of examples is indicated by the segment_ids of the
        # target: the largest id in each row, summed over rows.
        examples_per_row = tf.math.reduce_max(batch.tgt.segment_ids, axis=1)
        # Note that this is per infeed value when p.use_per_host_infeed = True.
        summary_utils.scalar('examples/num_packed_examples',
                             tf.reduce_sum(examples_per_row))
Beispiel #12
0
    def _Pack(self, batch):
        """Packs `batch.src` and `batch.tgt` and attaches segment information.

        Note that this may change the batch size.

        Sequence lengths are the row sums of `ids_indicator`. When
        `self.params.packing_factor` is falsy nothing is packed: on both sides
        `segment_ids` is set to `ids_indicator` and `segment_pos` to
        `_GetSegmentPos(ids_indicator)`. Otherwise `ops.pack_sequences`
        decides the layout, every field of `src` and `tgt` is rearranged with
        `ops.apply_packing`, and `.paddings`, `.segment_ids` (as float32) and
        `.segment_pos` are set on both sides. Length histograms are reported
        through `summary_utils`.

        Args:
          batch: a `.NestedMap` of input tensors to be packed. It is modified
            in place.
        """

        def _RowLengths(indicator):
            return tf.math.reduce_sum(tf.cast(indicator, tf.int32), axis=1)

        src_lengths = _RowLengths(batch.src.ids_indicator)
        tgt_lengths = _RowLengths(batch.tgt.ids_indicator)
        summary_utils.histogram('source_seq_lengths', src_lengths)
        summary_utils.histogram('target_seq_lengths', tgt_lengths)

        if not self.params.packing_factor:
            # Supply segment_ids and segment_pos with no packing.
            for side in (batch.src, batch.tgt):
                side.segment_ids = side.ids_indicator
                side.segment_pos = _GetSegmentPos(side.ids_indicator)
            return

        packing = ops.pack_sequences(
            src_lengths, tgt_lengths, self._ScaledBatchSize(),
            self.params.source_max_length, self.params.target_max_length)
        (src_segment_ids, src_segment_pos, src_indices, tgt_segment_ids,
         tgt_segment_pos, tgt_indices) = packing

        def _DistinctIndices(indices_in_input):
            # Distinct values that occur anywhere in indices_in_input.
            return tf.unique(tf.reshape(indices_in_input, [-1])).y

        distinct_src = _DistinctIndices(src_indices)
        distinct_tgt = _DistinctIndices(tgt_indices)
        summary_utils.histogram(
            'packed_source_seq_lengths',
            tf.gather(src_lengths, distinct_src, axis=0))
        summary_utils.histogram(
            'packed_target_seq_lengths',
            tf.gather(tgt_lengths, distinct_tgt, axis=0))

        def _PackSide(side, segment_ids, segment_pos, indices_in_input):
            """Returns `side` packed according to the given layout."""

            def _PackField(x):
                # String fields use '\t' as padding; everything else uses 0.
                pad_value = '\t' if x.dtype == tf.string else 0
                return ops.apply_packing(x, pad_value, segment_ids,
                                         indices_in_input)

            # Paddings are packed separately with padding value 1, before the
            # generic transform, and then override the transformed field.
            paddings = ops.apply_packing(side.paddings, 1, segment_ids,
                                         indices_in_input)
            packed = side.Transform(_PackField)
            packed.paddings = paddings
            packed.segment_ids = tf.cast(segment_ids, tf.float32)
            packed.segment_pos = segment_pos
            return packed

        batch.src = _PackSide(batch.src, src_segment_ids, src_segment_pos,
                              src_indices)
        batch.tgt = _PackSide(batch.tgt, tgt_segment_ids, tgt_segment_pos,
                              tgt_indices)
Beispiel #13
0
 def testApplyPackingErrors(self, expected_error_type, expected_error,
                            test):
     """Checks that apply_packing rejects the arguments held by `test`.

     Args:
       expected_error_type: exception class that must be raised.
       expected_error: regular expression the error message must match.
       test: object providing `input`, `padding`, `segment_ids` and
         `indices_in_input` for the call.
     """
     with self.assertRaisesRegex(expected_error_type,
                                 expected_error), self.session():
         packed = ops.apply_packing(test.input, test.padding,
                                    test.segment_ids, test.indices_in_input)
         packed.eval()
Beispiel #14
0
    def testPackSingleSequence(self, input_lengths, max_packed_length,
                               require_sequential_order, expected_packed_idxs):
        """Checks pack_single_sequence and its compatibility with apply_packing.

        Args:
          input_lengths: length of each input sequence.
          max_packed_length: length of every packed row.
          require_sequential_order: forwarded unchanged to
            `ops.pack_single_sequence`.
          expected_packed_idxs: for each packed row, the input indices
            expected in that row, in order.
        """
        with self.session() as sess:
            np.random.seed(12345)
            packing_op = ops.pack_single_sequence(
                input_lengths=input_lengths,
                max_packed_length=max_packed_length,
                require_sequential_order=require_sequential_order)
            segment_ids, indices_in_input = sess.run(packing_op)
            self.assertLen(expected_packed_idxs, segment_ids.shape[0])

            # Test the output is compatible with apply_packing: draw random
            # [length, 2, 2] inputs and pad each to max_packed_length rows.
            inputs = [
                np.random.randint(100000, size=[length, 2, 2], dtype=np.int32)
                for length in input_lengths
            ]
            padded_inputs = [
                tf.pad(x,
                       [[0, max_packed_length - x.shape[0]], [0, 0], [0, 0]])
                for x in inputs
            ]
            outputs = sess.run(
                ops.apply_packing(input=tf.stack(padded_inputs),
                                  padding=0,
                                  segment_ids=segment_ids,
                                  indices_in_input=indices_in_input))

            packed_rows = zip(segment_ids, indices_in_input, outputs,
                              expected_packed_idxs)
            for row_segment_ids, row_idxs, row_output, row_inputs in packed_rows:
                # Build the expected results from the provided
                # expected_packed_idxs; segment numbers start at 1.
                want_segment_ids = []
                want_idxs = []
                want_chunks = []
                for segment_number, input_idx in enumerate(row_inputs,
                                                           start=1):
                    length = input_lengths[input_idx]
                    want_segment_ids.extend([segment_number] * length)
                    want_idxs.extend([input_idx] * length)
                    want_chunks.append(inputs[input_idx])
                want_output = np.concatenate(want_chunks)
                packed_len = len(want_output)
                self.assertLessEqual(packed_len, max_packed_length)
                self.assertLen(want_segment_ids, packed_len)
                self.assertLen(want_idxs, packed_len)

                # Check indices_in_input is non-decreasing over the packed
                # (non-padding) prefix of the row.
                if packed_len > 1:
                    later = row_idxs[1:packed_len]
                    earlier = row_idxs[:packed_len - 1]
                    self.assertAllGreaterEqual(later - earlier, 0)

                # Pad to max_packed_length: segment id 0, index -1, zeros.
                pad_len = max_packed_length - packed_len
                want_segment_ids.extend([0] * pad_len)
                want_idxs.extend([-1] * pad_len)
                want_output = np.pad(want_output,
                                     [(0, pad_len), (0, 0), (0, 0)],
                                     mode='constant')

                self.assertAllEqual(want_idxs, row_idxs)
                self.assertAllEqual(want_segment_ids, row_segment_ids)
                self.assertAllEqual(want_output, row_output)