Example #1
    def _DataSourceFromFilePattern(self, file_pattern):
        def Proc(record):
            """Parses a serialized tf.Example record."""
            outputs = [
                ('source_id', tf.VarLenFeature(tf.int64)),
                ('source_padding', tf.VarLenFeature(tf.float32)),
                ('target_id', tf.VarLenFeature(tf.int64)),
                ('target_padding', tf.VarLenFeature(tf.float32)),
                ('target_label', tf.VarLenFeature(tf.int64)),
                ('target_weight', tf.VarLenFeature(tf.float32)),
            ]
            features = tf.parse_single_example(record, dict(outputs))
            for k, v in six.iteritems(features):
                features[k] = v.values
            bucket_key = tf.to_int32(
                tf.maximum(tf.reduce_sum(1.0 - features['source_padding']),
                           tf.reduce_sum(1.0 - features['target_padding'])))
            return [features[k] for k, _ in outputs] + [bucket_key]

        return py_x_ops.generic_input(
            file_pattern=file_pattern,
            processor=Proc,
            dynamic_padding_dimensions=[0] * 6,
            dynamic_padding_constants=[0, 1, 0, 1, 0, 0],
            **self.CommonInputOpArgs())
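Note: CommonInputOpArgs() is not defined in any of these snippets. A minimal
sketch, assuming it merely bundles the common generic_input arguments that
Examples #4, #5 and #7 pass explicitly (the self.params field names here are
assumptions):

    def CommonInputOpArgs(self):
        # Assumed sketch, not the library's actual implementation: gather the
        # shared generic_input knobs so each input class need not repeat them.
        p = self.params
        return {
            'file_random_seed': p.file_random_seed,
            'file_buffer_size': p.file_buffer_size,
            'file_parallelism': p.file_parallelism,
            'bucket_upper_bound': p.bucket_upper_bound,
            'bucket_batch_limit': p.bucket_batch_limit,
        }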
Example #2
    def _DataSourceFromFilePattern(self, file_pattern):
        return py_x_ops.generic_input(
            file_pattern=file_pattern,
            processor=self._ProcessLine,
            dynamic_padding_dimensions=[0] * 6,
            dynamic_padding_constants=[0, 1, 0, 1, 0, 0],
            **self.CommonInputOpArgs())
Example #3
    def _DataSourceFromFilePattern(self, file_pattern):
        def Proc(record):
            """Parses a serialized tf.Example record."""
            features = [
                ('uttid', tf.VarLenFeature(tf.string)),
                ('transcript', tf.VarLenFeature(tf.string)),
                ('frames', tf.VarLenFeature(tf.float32)),
            ]
            example = tf.parse_single_example(record, dict(features))
            fval = {k: v.values for k, v in six.iteritems(example)}
            # Reshape the flattened vector into its original time-major
            # representation.
            fval['frames'] = tf.reshape(fval['frames'],
                                        shape=[-1, self.params.frame_size])
            # Input duration determines the bucket.
            bucket_key = tf.to_int32(tf.shape(fval['frames'])[0])
            if self.params.append_eos_frame:
                bucket_key += 1
            tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds(
                fval['transcript'])
            src_paddings = tf.zeros([tf.shape(fval['frames'])[0]],
                                    dtype=tf.float32)
            return fval['uttid'], tgt_ids, tgt_labels, tgt_paddings, fval[
                'frames'], src_paddings, bucket_key

        return py_x_ops.generic_input(file_pattern=file_pattern,
                                      processor=Proc,
                                      dynamic_padding_dimensions=[0] * 6,
                                      dynamic_padding_constants=[0] * 5 + [1],
                                      **self.CommonInputOpArgs())
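Note the padding constants here: the processor returns six tensors (uttid,
tgt_ids, tgt_labels, tgt_paddings, frames, src_paddings), and
dynamic_padding_constants=[0] * 5 + [1] zero-pads all of them except the last,
src_paddings, which is padded with 1.0 so that frames appended by batching are
marked as padding.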
Example #4
    def get_test_input(self, path, **kwargs):
        return py_x_ops.generic_input(file_pattern='tfrecord:' + path,
                                      file_random_seed=0,
                                      file_buffer_size=32,
                                      file_parallelism=4,
                                      bucket_batch_limit=[8],
                                      **kwargs)
Example #5
    def get_test_input(self, path, **kwargs):
        return py_x_ops.generic_input(
            file_pattern=','.join(['tfrecord:' + path, 'tfrecord:' + path]),
            input_source_weights=[0.3, 0.7],
            file_random_seed=0,
            file_buffer_size=32,
            file_parallelism=4,
            bucket_batch_limit=[8],
            **kwargs)
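Listing the same file twice yields two input sources; input_source_weights then
makes generic_input draw roughly 30% of its records from the first source and
70% from the second. Example #7 below verifies this mixing empirically.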
Example #6
    def _DataSourceFromFilePattern(self, file_pattern):
        def ReadInput(line):
            # Number of whitespace-separated tokens in the line.
            word_count = tf.size(tf.strings.split([line]))
            # Character-level length of the line.
            strlen = tf.size(tf.strings.split([line], ''))
            return line, word_count, strlen

        return py_x_ops.generic_input(file_pattern=file_pattern,
                                      processor=ReadInput,
                                      **self.CommonInputOpArgs())
Example #7
    def testMix(self):
        # Generate couple files.
        def generate_test_data(tag, cnt):
            tmp = os.path.join(tf.test.get_temp_dir(), tag)
            with tf.python_io.TFRecordWriter(tmp) as w:
                for i in range(cnt):
                    w.write('%s:%08d' % (tag, i))
            return tmp

        path1 = generate_test_data('input1', 100)
        path2 = generate_test_data('input2', 200)
        path3 = generate_test_data('input3', 10)

        g = tf.Graph()
        with g.as_default():
            # A record processor written in TF graph.
            def _process(record):
                return record, record, tf.to_int32(1)

            # Samples random records from the data files and processes them
            # to generate batches.
            strs, vals = py_x_ops.generic_input(
                file_pattern=','.join([
                    'tfrecord:' + path1, 'tfrecord:' + path2,
                    'tfrecord:' + path3
                ]),
                input_source_weights=[0.2, 0.3, 0.5],
                file_random_seed=0,
                file_buffer_size=32,
                file_parallelism=4,
                bucket_batch_limit=[8],
                bucket_upper_bound=[1],
                processor=_process)

        with self.session(graph=g) as sess:
            tags_count = collections.defaultdict(int)
            total_count = 10000
            for _ in range(total_count):
                ans_strs, ans_vals = sess.run([strs, vals])
                for s in ans_strs:
                    tags_count[s.split(':')[0]] += 1
                self.assertEqual(ans_strs.shape, (8, ))
                self.assertEqual(ans_vals.shape, (8, ))
            self.assertEqual(sum(tags_count.values()), total_count * 8)
            mix_ratios = {}
            for k, v in six.iteritems(tags_count):
                mix_ratios[k] = float(v) / total_count / 8
            self.assertAlmostEqual(mix_ratios['input1'], 0.2, delta=0.01)
            self.assertAlmostEqual(mix_ratios['input2'], 0.3, delta=0.01)
            self.assertAlmostEqual(mix_ratios['input3'], 0.5, delta=0.01)
Example #8
    def _DataSourceFromFilePattern(self, file_pattern):
        """Create the input processing op.

    Args:
      file_pattern: The file pattern to use as input.

    Returns:
      an operation that when executed, calls `_ProcessLine` on a line read
    from `file_pattern`.
    """
        return py_x_ops.generic_input(
            file_pattern=file_pattern,
            processor=self._ProcessLine,
            # Pad dimension 0 to the same length.
            dynamic_padding_dimensions=[0] * 6,
            # The constant values to use for padding each of the outputs.
            dynamic_padding_constants=[0, 1, 0, 1, 0, 0],
            **self.CommonInputOpArgs())
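_ProcessLine itself is not shown in any of these snippets. A hypothetical
minimal sketch, assuming each line is a tab-separated source/target sentence
pair and reusing the StringsToIds helper seen in Example #3; the six outputs
are ordered to match the padding constants above (the two padding tensors sit
in positions 2 and 4, which are padded with 1):

    def _ProcessLine(self, line):
        # Hypothetical sketch, not the library's implementation. Assumes the
        # line is "source<TAB>target".
        fields = tf.strings.split([line], '\t').values
        src_ids, _, src_paddings = self.StringsToIds([fields[0]])
        tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds([fields[1]])
        # Weight real target tokens 1.0 and padded positions 0.0.
        tgt_weights = 1.0 - tgt_paddings
        # Bucket by the longer of the source and target sequences.
        bucket_key = tf.to_int32(
            tf.maximum(tf.reduce_sum(1.0 - src_paddings),
                       tf.reduce_sum(1.0 - tgt_paddings)))
        return (src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels,
                tgt_weights, bucket_key)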
Example #9
    def _DataSourceFromFilePattern(self, file_pattern):
        if self.params.use_sst:
            return py_x_ops.lm_text_input(
                file_pattern=file_pattern,
                normalization='',
                proto='string',
                **self.CommonInputOpArgs())
        else:
            def ReadInput(line):
                word_count = tf.size(tf.strings.split([line]))
                strlen = tf.size(tf.strings.split([line], ''))
                return line, word_count, strlen

            return py_x_ops.generic_input(
                file_pattern=file_pattern,
                processor=ReadInput,
                **self.CommonInputOpArgs())
Example #10
    def testPadding(self):
        # Generate a test file w/ 50 records of different lengths.
        tmp = os.path.join(tf.test.get_temp_dir(), 'basic')
        with tf.python_io.TFRecordWriter(tmp) as w:
            for n in range(1, 50):
                w.write(pickle.dumps(np.full([n, 3, 3], n, np.int32)))

        g = tf.Graph()
        with g.as_default():
            # A record processor written in TF graph.
            def _process(record):
                num = tf.py_func(lambda x: pickle.loads(x), [record], tf.int32)
                bucket_key = tf.shape(num)[0]
                return num, tf.transpose(num, [1, 0, 2]), bucket_key

            # Samples random records from the data files and processes them
            # to generate batches.
            vals_t, transposed_vals_t = py_x_ops.generic_input(
                file_pattern='tfrecord:' + tmp,
                file_random_seed=0,
                file_buffer_size=32,
                file_parallelism=4,
                bucket_upper_bound=[10],
                bucket_batch_limit=[8],
                processor=_process,
                dynamic_padding_dimensions=[0, 1],
                dynamic_padding_constants=[0] * 2)

        with self.session(graph=g) as sess:
            for i in range(10):
                vals, transposed_vals = sess.run([vals_t, transposed_vals_t])
                print(vals, np.transpose(transposed_vals, [0, 2, 1, 3]))
                self.assertEqual(vals.shape[0], 8)
                self.assertEqual(vals.shape[2], 3)
                self.assertEqual(vals.shape[3], 3)
                largest = np.amax(vals)
                self.assertLessEqual(largest, 10)
                self.assertEqual(vals.shape[1], largest)
                for j in range(8):
                    n = vals[j, 0, 0, 0]
                    self.assertTrue(np.all(vals[j, :n] == n))
                    self.assertTrue(np.all(vals[j, n:] == 0))
                self.assertAllEqual(
                    vals, np.transpose(transposed_vals, [0, 2, 1, 3]))
Example #11
    def testBasic(self):
        # Generate a test file w/ 100 records.
        tmp = os.path.join(tf.test.get_temp_dir(), 'basic')
        with tf.python_io.TFRecordWriter(tmp) as w:
            for i in range(100):
                w.write('%08d' % i)

        g = tf.Graph()
        with g.as_default():

            # A simple string parsing routine. Just convert a string to a
            # number.
            def str_to_num(s):
                return np.array(float(s), dtype=np.float32)

            # A record processor written in TF graph.
            def _process(record):
                num, = tf.py_func(str_to_num, [record], [tf.float32])
                return record, tf.stack([num, tf.square(num)]), tf.to_int32(1)

            # Samples random records from the data files and processes them
            # to generate batches.
            strs, vals = py_x_ops.generic_input(file_pattern='tfrecord:' + tmp,
                                                file_random_seed=0,
                                                file_buffer_size=32,
                                                file_parallelism=4,
                                                bucket_upper_bound=[1],
                                                bucket_batch_limit=[8],
                                                processor=_process)

        with self.session(graph=g) as sess:
            record_seen = set()
            for i in range(100):
                ans_strs, ans_vals = sess.run([strs, vals])
                for s in ans_strs:
                    record_seen.add(s)
                self.assertEqual(ans_strs.shape, (8, ))
                self.assertEqual(ans_vals.shape, (8, 2))
                self.assertAllEqual(np.square(ans_vals[:, 0]), ans_vals[:, 1])
            for i in range(100):
                self.assertTrue('%08d' % i in record_seen)