Example #1
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
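The filenames returned by train_data_filenames, dev_data_filenames and test_data_filenames follow the sharded pattern asserted in the tests below (for example "-train-00000-of-00001"). A minimal sketch of that naming convention, shown for orientation only and not taken from the library source:

import os

def sharded_filenames(basename, data_dir, split, num_shards):
  # Hypothetical helper mirroring the "<basename>-<split>-%05d-of-%05d"
  # pattern the tests below assert on (e.g. "-train-00000-of-00001").
  return [
      os.path.join(data_dir,
                   "%s-%s-%05d-of-%05d" % (basename, split, shard, num_shards))
      for shard in range(num_shards)
  ]

# sharded_filenames("my_problem", "/tmp/data", "train", 2)
# -> ["/tmp/data/my_problem-train-00000-of-00002",
#     "/tmp/data/my_problem-train-00001-of-00002"]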
Example #2
  def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path)
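As a quick sanity check, the shard written above can be read back as a TFRecord file of serialized tf.train.Example protos, which is consistent with how Example #4 parses it via VarLenFeature. A hypothetical snippet using the TF 1.x record iterator, reusing tmp_file_path from the test:

import tensorflow as tf

path = tmp_file_path + "-train-00000-of-00001"
for record in tf.python_io.tf_record_iterator(path):
  example = tf.train.Example.FromString(record)
  # Expect the single trivial example written by test_generator: inputs == [1].
  print(example.features.feature["inputs"].int64_list.value)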
Example #3
  def training_filepaths(self, data_dir, num_shards, shuffled):
    file_basename = self.dataset_filename()
    if not shuffled:
      file_basename += generator_utils.UNSHUFFLED_SUFFIX
    return generator_utils.train_data_filenames(file_basename, data_dir,
                                                num_shards)
Example #4
    def _testBatchExamples(self):
        tf.set_random_seed(1)
        tmp_dir = self.get_temp_dir()
        (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
        tmp_file_name = os.path.basename(tmp_file_path)

        # Generate a file with 100 examples, n-th example of length n + 1.
        def test_generator():
            for i in xrange(100):
                yield {
                    "inputs": [i + 1 for _ in xrange(i + 1)],
                    "targets": [i + 1]
                }

        filenames = generator_utils.train_data_filenames(
            tmp_file_name, tmp_dir, 1)
        generator_utils.generate_files(test_generator(), filenames)
        self.assertTrue(
            tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

        examples_train = data_reader.examples_reader(
            [tmp_file_path + "*"], {
                "inputs": tf.VarLenFeature(tf.int64),
                "targets": tf.VarLenFeature(tf.int64)
            }, True)
        batch_train = data_reader.batch_examples(examples_train, 4)
        examples_eval = data_reader.examples_reader(
            [tmp_file_path + "*"], {
                "inputs": tf.VarLenFeature(tf.int64),
                "targets": tf.VarLenFeature(tf.int64)
            }, False)
        batch_eval = data_reader.batch_examples(examples_eval, 2)
        session, coord = tf.Session(), tf.train.Coordinator()
        with session.as_default():
            tf.train.start_queue_runners(coord=coord)

            # Evaluation data comes in the same order as in the file.
            # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
            examples = session.run(batch_eval)
            self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
            self.assertAllClose(examples["targets"], np.array([[1], [2]]))
            # Check the second batch too.
            examples = session.run(batch_eval)
            self.assertAllClose(examples["inputs"],
                                np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
            self.assertAllClose(examples["targets"], np.array([[3], [4]]))

            # Training data is shuffled but shouldn't have too many pads.
            for _ in xrange(10):
                examples = session.run(batch_train)
                inputs = examples["inputs"]
                # At most 3 of the 4 examples in a batch have any padding
                # zeros (the longest example has none), hence dividing by 3.
                pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
                # Default bucketing groups lengths in steps of 8 up to 64 and
                # in steps of 32 beyond that.
                if int(max(examples["targets"])) < 64:
                    self.assertLess(pad_per_example, 8)
                else:
                    self.assertLess(pad_per_example, 32)

        # Clean up.
        coord.request_stop()
        coord.join()
        os.remove(tmp_file_path + "-train-00000-of-00001")
        os.remove(tmp_file_path)
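The padding bounds asserted in the loop above follow from the bucket widths mentioned in the comment: sequences are grouped into length buckets of width 8 up to length 64 and width 32 beyond, so examples batched together differ by less than one bucket width. A hypothetical illustration of that rule, not taken from data_reader itself:

def bucket_boundary(length):
    # Round a length up to its bucket boundary: buckets are 8 wide up to 64
    # and 32 wide after that, which bounds the padding added per example.
    step = 8 if length < 64 else 32
    return ((length + step - 1) // step) * step

assert bucket_boundary(5) == 8
assert bucket_boundary(63) == 64
assert bucket_boundary(70) == 96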
Example #6
def generate_tf_records(out_path="/home/abhishek/tmp/tf-records",
                        input_path="/home/abhishek/tmp/LJSpeech-1.1",
                        num_shards=100):
    train_paths = generator_utils.train_data_filenames(
        "lj_speech", out_path, num_shards)
    char2idx, _ = data_load.load_vocab()
    generator_utils.generate_files(_generator(input_path, char2idx), train_paths)
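Note that, unlike Example #1, this writes directly to the final train filenames (no UNSHUFFLED_SUFFIX is appended) and never calls generator_utils.shuffle_dataset, so records stay in generation order. A hypothetical variant of the body that mirrors Example #1's write-then-shuffle pattern:

    train_paths = generator_utils.train_data_filenames(
        "lj_speech" + generator_utils.UNSHUFFLED_SUFFIX, out_path, num_shards)
    char2idx, _ = data_load.load_vocab()
    generator_utils.generate_files(_generator(input_path, char2idx), train_paths)
    generator_utils.shuffle_dataset(train_paths)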