def generate_data_for_problem(problem):
  """Generate train/dev/test data for a problem in _SUPPORTED_PROBLEM_GENERATORS.

  Writes sharded, unshuffled data files for the training, development and
  (optionally) test splits, then shuffles all of them together at the end.

  Args:
    problem: string key into _SUPPORTED_PROBLEM_GENERATORS naming the problem
      whose data should be generated.
  """
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  # Dev and test each get ~10% as many shards as train, but never zero:
  # plain int(num_train_shards * 0.1) truncates to 0 when num_train_shards
  # is below 10, which would silently produce no dev/test files at all.
  num_eval_shards = max(1, int(num_train_shards * 0.1))
  # All splits share the same unshuffled basename; compute it once.
  file_basename = problem + generator_utils.UNSHUFFLED_SUFFIX

  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      file_basename, FLAGS.data_dir, num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      file_basename, FLAGS.data_dir, num_eval_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  # A problem may opt out of test data by returning None from its generator.
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        file_basename, FLAGS.data_dir, num_eval_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  # Shuffle every file that was produced, across all splits.
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
def testGenerateFiles(self):
  """Generating a one-example dataset creates the expected shard file."""
  work_dir = self.get_temp_dir()
  _, stub_path = tempfile.mkstemp(dir=work_dir)
  stub_name = os.path.basename(stub_path)

  # A generator yielding a single trivial example.
  def trivial_generator():
    yield {"inputs": [1], "target": [1]}

  out_files = generator_utils.train_data_filenames(stub_name, work_dir, 1)
  generator_utils.generate_files(trivial_generator(), out_files)

  shard_path = stub_path + "-train-00000-of-00001"
  self.assertTrue(tf.gfile.Exists(shard_path))

  # Clean up both the shard and the mkstemp stub.
  os.remove(shard_path)
  os.remove(stub_path)
def training_filepaths(self, data_dir, num_shards, shuffled):
  """Return the sharded training file paths for this problem.

  Args:
    data_dir: directory where the data files live.
    num_shards: number of shards to name.
    shuffled: if False, the unshuffled suffix is appended to the basename.

  Returns:
    The list produced by generator_utils.train_data_filenames.
  """
  suffix = "" if shuffled else generator_utils.UNSHUFFLED_SUFFIX
  return generator_utils.train_data_filenames(
      self.dataset_filename() + suffix, data_dir, num_shards)
def _testBatchExamples(self):
  """End-to-end check of examples_reader + batch_examples on generated data.

  Writes 100 variable-length examples to a sharded file, then verifies that
  eval batching preserves file order/padding and that train batching (which
  shuffles and buckets by length) does not over-pad.
  """
  tf.set_random_seed(1)  # make the shuffled training stream deterministic
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a file with 100 examples, n-th example of length n + 1.
  def test_generator():
    for i in xrange(100):
      yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]}

  filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
  generator_utils.generate_files(test_generator(), filenames)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

  # Training reader (training=True) shuffles; batch size 4.
  examples_train = data_reader.examples_reader(
      [tmp_file_path + "*"],
      {"inputs": tf.VarLenFeature(tf.int64),
       "targets": tf.VarLenFeature(tf.int64)}, True)
  batch_train = data_reader.batch_examples(examples_train, 4)
  # Eval reader (training=False) keeps file order; batch size 2.
  examples_eval = data_reader.examples_reader(
      [tmp_file_path + "*"],
      {"inputs": tf.VarLenFeature(tf.int64),
       "targets": tf.VarLenFeature(tf.int64)}, False)
  batch_eval = data_reader.batch_examples(examples_eval, 2)

  session, coord = tf.Session(), tf.train.Coordinator()
  with session.as_default():
    tf.train.start_queue_runners(coord=coord)
    # Evaluation data comes in the same order as in the file.
    # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
    self.assertAllClose(examples["targets"], np.array([[1], [2]]))
    # Check the second batch too.
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"],
                        np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
    self.assertAllClose(examples["targets"], np.array([[3], [4]]))
    # Training data is shuffled but shouldn't have too many pads.
    for _ in xrange(10):
      examples = session.run(batch_train)
      inputs = examples["inputs"]
      # Only 3 out of 4 examples in a batch have padding zeros at all.
      pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
      # Default bucketing is in steps of 8 until 64 and 32 later.
      if int(max(examples["targets"])) < 64:
        self.assertLess(pad_per_example, 8)
      else:
        self.assertLess(pad_per_example, 32)
    # Clean up.
    coord.request_stop()
    coord.join()
  os.remove(tmp_file_path + "-train-00000-of-00001")
  os.remove(tmp_file_path)
def generate_tf_records(out_path="/home/abhishek/tmp/tf-records",
                        input_path="/home/abhishek/tmp/LJSpeech-1.1",
                        num_shards=100):
    """Write sharded "lj_speech" training TFRecords.

    Args:
        out_path: directory that receives the sharded record files.
        input_path: root of the LJSpeech dataset fed to _generator.
        num_shards: how many training shards to produce.
    """
    shard_paths = generator_utils.train_data_filenames(
        "lj_speech", out_path, num_shards)
    # Only the char-to-index mapping is needed; the reverse map is discarded.
    char2idx, _ = data_load.load_vocab()
    generator_utils.generate_files(
        _generator(input_path, char2idx), shard_paths)