def generate_data(self, data_dir, _):
  generator_utils.generate_files(
      algorithmic.identity_generator(self.num_symbols, 40, 100000),
      self.training_filepaths(data_dir, 1, shuffled=True), 100)
  generator_utils.generate_files(
      algorithmic.identity_generator(self.num_symbols, 400, 10000),
      self.dev_filepaths(data_dir, 1, shuffled=True), 100)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(data_dir, 1, shuffled=True)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, True), train_paths)
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  self.data_dir = data_dir
  # Map each dataset split to the mode name ('train'/'dev'/'test') used when
  # preprocessing the raw data.
  self.mode = {problem.DatasetSplit.TRAIN: 'train',
               problem.DatasetSplit.EVAL: 'dev',
               problem.DatasetSplit.TEST: 'test'}
  filepath_fns = {problem.DatasetSplit.TRAIN: self.training_filepaths,
                  problem.DatasetSplit.EVAL: self.dev_filepaths,
                  problem.DatasetSplit.TEST: self.test_filepaths}
  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)
  if self.is_generate_per_split:
    for split, paths in split_paths:
      # Create the source and target txt files from the raw data.
      self.preprocess_data(self.mode[split])
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)
  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }
  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)
  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)
  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())

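# For context: the split_paths construction above consumes the problem's
# dataset_splits property. A minimal sketch of such an override, assuming the
# tensor2tensor Problem API; the class name and shard counts below are
# illustrative, not from the source.
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems


class MyTextProblem(text_problems.Text2TextProblem):
  """Hypothetical problem showing a dataset_splits override."""

  @property
  def dataset_splits(self):
    # One dict per split: which split it is, and how many TFRecord shards
    # generate_data should write for it.
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 100,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 1,
    }]
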
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """The function generating the data.""" filepath_fns = { problem.DatasetSplit.TRAIN: self.training_filepaths, problem.DatasetSplit.EVAL: self.dev_filepaths, problem.DatasetSplit.TEST: self.test_filepaths, } # We set shuffled=True as we don't want to shuffle on disk later. split_paths = [(split["split"], filepath_fns[split["split"]](data_dir, split["shards"], shuffled=True)) for split in self.dataset_splits] all_paths = [] for _, paths in split_paths: all_paths.extend(paths) if self.is_generate_per_split: for split, paths in split_paths: generator_utils.generate_files( self.generate_encoded_samples_debug( data_dir, tmp_dir, split), paths, cycle_every_n=self.total_number_of_frames // len(paths)) else: generator_utils.generate_files( self.generate_encoded_samples_debug( data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths, cycle_every_n=self.total_number_of_frames // len(all_paths))
def generate_data(self, data_dir, tmp_dir, task_id=-1): """Saves the rollout history to disk.""" # Shuffle rollouts globally taking advantage of the fact that we have # everything in memory. epoch_rollout_tuples = list() for epoch_nr, rollouts in self.rollouts_by_epoch.items(): for rollout in rollouts: epoch_rollout_tuples.append((epoch_nr, rollout)) random.shuffle(epoch_rollout_tuples) filepath_fns = { problem.DatasetSplit.TRAIN: self.training_filepaths, problem.DatasetSplit.EVAL: self.dev_filepaths, problem.DatasetSplit.TEST: self.test_filepaths, } # We set shuffled=True as we don't want to shuffle on disk later. paths = [ path for split in self.dataset_splits for path in filepath_fns[split["split"]]( data_dir, split["shards"], shuffled=True ) ] num_frames = sum(len(rollout) for (_, rollout) in epoch_rollout_tuples) shard_size = num_frames // len(paths) generator_utils.generate_files( self._generate_frames(epoch_rollout_tuples), paths, cycle_every_n=shard_size )
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }
  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)
  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_samples(data_dir, tmp_dir,
                              problem.DatasetSplit.TRAIN), all_paths)
  generator_utils.shuffle_dataset(all_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """The function generating the data.""" filepath_fns = { problem.DatasetSplit.TRAIN: self.training_filepaths, problem.DatasetSplit.EVAL: self.dev_filepaths, problem.DatasetSplit.TEST: self.test_filepaths, } # We set shuffled=True as we don't want to shuffle on disk later. split_paths = [(split["split"], filepath_fns[split["split"]]( data_dir, split["shards"], shuffled=True)) for split in self.dataset_splits] all_paths = [] for _, paths in split_paths: all_paths.extend(paths) if self.is_generate_per_split: for split, paths in split_paths: generator_utils.generate_files( self.generate_encoded_samples_debug( data_dir, tmp_dir, split), paths, cycle_every_n=self.total_number_of_frames // len(paths)) else: generator_utils.generate_files( self.generate_encoded_samples_debug( data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths, cycle_every_n=self.total_number_of_frames // len(all_paths))
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }
  split_paths = {
      split["split"]: filepath_fns[split["split"]](
          data_dir, split["shards"], shuffled=False)
      for split in self.dataset_splits
  }
  all_paths = []
  for paths in split_paths.values():
    all_paths.extend(paths)
  if self.is_generate_per_split:
    for split, paths in split_paths.items():
      generator_utils.generate_files(
          self._maybe_pack_examples(
              self.generate_encoded_samples(data_dir, tmp_dir, split)),
          paths)
  else:
    generator_utils.generate_files(
        self._maybe_pack_examples(
            self.generate_encoded_samples(
                data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths)
  generator_utils.shuffle_dataset(all_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Map each split to the method that builds its output filenames.
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }
  # Build each split's filenames according to the dataset_splits settings.
  split_paths = [
      (split["split"], filepath_fns[split["split"]](
          data_dir, split["shards"], shuffled=self.already_shuffled))
      for split in self.dataset_splits
  ]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)
  # split is train/eval/test; paths is the list of per-shard filenames
  # generated for that split.
  if self.is_generate_per_split:
    for split, paths in split_paths:
      # generate_files(generator, filenames) writes the token-id samples
      # produced by the generator into the given filenames.
      generator_utils.generate_files(
          # generate_encoded_samples yields samples as token ids.
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(data_dir, tmp_dir,
                                      problem.DatasetSplit.TRAIN), all_paths)
  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Note: tmp_dir is passed through to generate_encoded_samples below, so it
  # must not be deleted as unused here (a `del tmp_dir` would raise a
  # NameError at the first use).
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }
  split_paths = [(split['split'],
                  filepath_fns[split['split']](data_dir, split['shards'],
                                               shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)
  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(data_dir, tmp_dir,
                                      problem.DatasetSplit.TRAIN), all_paths)
  generator_utils.shuffle_dataset(all_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Saves the rollout history to disk, split into train/dev sets.""" self._split_current_epoch() splits_and_paths = self.splits_and_paths(data_dir) num_epochs = len(self._rollouts_by_epoch_and_split) for (epoch_index, (epoch, rollouts_by_split)) in enumerate( six.iteritems(self._rollouts_by_epoch_and_split)): for (split, paths) in splits_and_paths: num_shards = len(paths) // num_epochs paths = paths[epoch_index * num_shards:(epoch_index + 1) * num_shards] rollouts = rollouts_by_split[split] num_frames = self._calc_num_frames(rollouts) shard_size = num_frames // len(paths) frame_gen = self._generate_frames(epoch, rollouts) for (path_index, path) in enumerate(paths): limit = shard_size # Put the remainder in the last shard to preserve the ordering. if path_index == len(paths) - 1: limit = None generator_utils.generate_files(itertools.islice( frame_gen, limit), [path], cycle_every_n=float("inf"))
def generate_data(self, data_dir, tmp_dir, task_id=-1): """Saves the rollout history to disk.""" # Suffle rollouts globally taking advantage of the fact that we have # everything in memory. shuffled_history = self.history[:] random.shuffle(shuffled_history) filepath_fns = { problem.DatasetSplit.TRAIN: self.training_filepaths, problem.DatasetSplit.EVAL: self.dev_filepaths, problem.DatasetSplit.TEST: self.test_filepaths, } # We set shuffled=True as we don't want to shuffle on disk later. splits_and_paths = [ (split["split"], path) for split in self.dataset_splits for path in filepath_fns[split["split"]]( data_dir, split["shards"], shuffled=True ) ] # Split entire rollouts into shards so that no rollout is broken on shard # boundary. shard_size = int(math.ceil(len(shuffled_history)) / len(splits_and_paths)) for (i, (split, path)) in enumerate(splits_and_paths): rollouts = shuffled_history[i * shard_size : (i + 1) * shard_size] generator_utils.generate_files( self._generate_frames(rollouts), [path], cycle_every_n=float("inf") )
def generate_data_shard(self, thread_ix, meta_list, out_file, encoder):
  tf.logging.info("[thread %d], %d image-label pairs" %
                  (thread_ix, len(meta_list)))
  generator_utils.generate_files(
      _example_generator(meta_list, self.image_shape, encoder,
                         self.get_helper()), [out_file])

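# A minimal, self-contained sketch of the generate_files pattern shared by
# the snippets here, assuming only tensor2tensor's generator_utils; the toy
# generator and output filename are illustrative, not from the source.
import os
import tempfile

from tensor2tensor.data_generators import generator_utils


def toy_generator():
  # generate_files consumes dicts mapping feature names to lists of ints,
  # floats, or strings, and serializes each dict as one tf.Example record.
  for i in range(10):
    yield {"inputs": [i, i + 1], "targets": [i]}


out_files = [os.path.join(tempfile.mkdtemp(), "toy-train-00000-of-00001")]
generator_utils.generate_files(toy_generator(), out_files)
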
def generate_data(self, data_dir, _):
  identity_problem = algorithmic.AlgorithmicIdentityBinary40()
  generator_utils.generate_files(
      identity_problem.generator(self.num_symbols, 40, 100000),
      self.training_filepaths(data_dir, 1, shuffled=True), 100)
  generator_utils.generate_files(
      identity_problem.generator(self.num_symbols, 400, 10000),
      self.dev_filepaths(data_dir, 1, shuffled=True), 100)

def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
  """Encode all frames in dataset with model and write them out to out_files."""
  batch_size = 8
  dataset = dataset.batch(batch_size)
  examples = dataset.make_one_shot_iterator().get_next()
  images = examples.pop("frame")
  images = tf.cast(images, tf.int32)
  encoded = model.encode(images)
  encoded_frame_height = int(
      math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
  encoded_frame_width = int(
      math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
  num_bits = 8
  encoded = tf.reshape(
      encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
  encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)
  pngs = tf.map_fn(tf.image.encode_png, encoded, dtype=tf.string,
                   back_prop=False)
  with tf.Session() as sess:
    autoencoder_saver = tf.train.Saver(tf.global_variables("autoencoder.*"))
    trainer_lib.restore_checkpoint(autoencoder_path, autoencoder_saver, sess,
                                   must_restore=True)

    def generator():
      """Generate examples."""
      while True:
        try:
          pngs_np, examples_np = sess.run([pngs, examples])
          rewards = examples_np["reward"].tolist()
          actions = examples_np["action"].tolist()
          frame_numbers = examples_np["frame_number"].tolist()
          for action, reward, frame_number, png in zip(
              actions, rewards, frame_numbers, pngs_np):
            yield {
                "action": action,
                "reward": reward,
                "frame_number": frame_number,
                "image/encoded": [png],
                "image/format": ["png"],
                "image/height": [encoded_frame_height],
                "image/width": [encoded_frame_width],
            }
        except tf.errors.OutOfRangeError:
          break

    generator_utils.generate_files(
        generator(), out_files,
        cycle_every_n=problem.total_number_of_frames // 10)

def setUpClass(cls):
  # Generate a small test dataset.
  FLAGS.problems = "algorithmic_addition_binary40"
  TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
  gen = algorithmic.identity_generator(2, 10, 300)
  generator_utils.generate_files(gen, FLAGS.problems + "-train",
                                 TrainerUtilsTest.data_dir, 1, 100)
  generator_utils.generate_files(gen, FLAGS.problems + "-dev",
                                 TrainerUtilsTest.data_dir, 1, 100)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  all_paths = train_paths + dev_paths
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir), all_paths)
  generator_utils.shuffle_dataset(all_paths)

def _testBatchExamples(self):
  tf.set_random_seed(1)
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a file with 100 examples, n-th example of length n + 1.
  def test_generator():
    for i in xrange(100):
      yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]}

  filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
  generator_utils.generate_files(test_generator(), filenames)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

  examples_train = data_reader.examples_queue([tmp_file_path + "*"], {
      "inputs": tf.VarLenFeature(tf.int64),
      "targets": tf.VarLenFeature(tf.int64)
  }, True)
  batch_train = data_reader.batch_examples(examples_train, 4)
  examples_eval = data_reader.examples_queue([tmp_file_path + "*"], {
      "inputs": tf.VarLenFeature(tf.int64),
      "targets": tf.VarLenFeature(tf.int64)
  }, False)
  batch_eval = data_reader.batch_examples(examples_eval, 2)
  session, coord = tf.Session(), tf.train.Coordinator()
  with session.as_default():
    tf.train.start_queue_runners(coord=coord)

    # Evaluation data comes in the same order as in the file.
    # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
    self.assertAllClose(examples["targets"], np.array([[1], [2]]))
    # Check the second batch too.
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"],
                        np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
    self.assertAllClose(examples["targets"], np.array([[3], [4]]))

    # Training data is shuffled but shouldn't have too many pads.
    for _ in xrange(10):
      examples = session.run(batch_train)
      inputs = examples["inputs"]
      # Only 3 out of 4 examples in a batch have padding zeros at all.
      pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
      # Default bucketing is in steps of 8 until 64 and 32 later.
      if int(max(examples["targets"])) < 64:
        self.assertLess(pad_per_example, 8)
      else:
        self.assertLess(pad_per_example, 32)

  # Clean up.
  coord.request_stop()
  coord.join()
  os.remove(tmp_file_path + "-train-00000-of-00001")
  os.remove(tmp_file_path)

def _testBatchExamples(self):
  tf.set_random_seed(1)
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a file with 100 examples, n-th example of length n + 1.
  def test_generator():
    for i in xrange(100):
      yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]}

  generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

  examples_train = data_reader.examples_queue([tmp_file_path + "*"], {
      "inputs": tf.VarLenFeature(tf.int64),
      "targets": tf.VarLenFeature(tf.int64)
  }, True)
  batch_train = data_reader.batch_examples(examples_train, 4)
  examples_eval = data_reader.examples_queue([tmp_file_path + "*"], {
      "inputs": tf.VarLenFeature(tf.int64),
      "targets": tf.VarLenFeature(tf.int64)
  }, False)
  batch_eval = data_reader.batch_examples(examples_eval, 2)
  session, coord = tf.Session(), tf.train.Coordinator()
  with session.as_default():
    tf.train.start_queue_runners(coord=coord)

    # Evaluation data comes in the same order as in the file.
    # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
    self.assertAllClose(examples["targets"], np.array([[1], [2]]))
    # Check the second batch too.
    examples = session.run(batch_eval)
    self.assertAllClose(examples["inputs"],
                        np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
    self.assertAllClose(examples["targets"], np.array([[3], [4]]))

    # Training data is shuffled but shouldn't have too many pads.
    for _ in xrange(10):
      examples = session.run(batch_train)
      inputs = examples["inputs"]
      # Only 3 out of 4 examples in a batch have padding zeros at all.
      pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
      # Default bucketing is in steps of 8 until 64 and 32 later.
      if int(max(examples["targets"])) < 64:
        self.assertLess(pad_per_example, 8)
      else:
        self.assertLess(pad_per_example, 32)

  # Clean up.
  coord.request_stop()
  coord.join()
  os.remove(tmp_file_path + "-00000-of-00001")
  os.remove(tmp_file_path)

def generate_dataset(h5_filepath, key_prefix, out_filepaths, start_idx=None,
                     end_idx=None):
  print("PID: %d, Key: %s, (Start, End): (%s, %s)" %
        (os.getpid(), key_prefix, start_idx, end_idx))
  generator_utils.generate_files(
      dataset_generator(h5_filepath, key_prefix, start_idx, end_idx),
      out_filepaths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Ganerate data for this problem.""" del tmp_dir, task_id identity_problem = AlgorithmicIdentityBinary40() utils.generate_files( identity_problem.generator(self.num_symbols, 40, 100000), self.training_filepaths(data_dir, 1, shuffled=True), 100) utils.generate_files( identity_problem.generator(self.num_symbols, 400, 10000), self.dev_filepaths(data_dir, 1, shuffled=True), 100)
def setUpClass(cls):
  # Generate a small test dataset.
  FLAGS.problems = "algorithmic_addition_binary40"
  TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
  gen = algorithmic.identity_generator(2, 10, 300)
  train_filenames = generator_utils.train_data_filenames(
      FLAGS.problems, TrainerUtilsTest.data_dir, 1)
  dev_filenames = generator_utils.dev_data_filenames(
      FLAGS.problems, TrainerUtilsTest.data_dir, 1)
  generator_utils.generate_files(gen, train_filenames, 100)
  generator_utils.generate_files(gen, dev_filenames, 100)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Ganerate data for this problem.""" del tmp_dir, task_id identity_problem = AlgorithmicIdentityBinary40() utils.generate_files( identity_problem.generator(self.num_symbols, 40, 100000), self.training_filepaths(data_dir, 1, shuffled=True), 100) utils.generate_files( identity_problem.generator(self.num_symbols, 400, 10000), self.dev_filepaths(data_dir, 1, shuffled=True), 100)
def generate_dataset(h5_filepath, key_prefix, out_filepaths, chunk_size=1,
                     start_idx=None, end_idx=None):
  print("PID: %d, Key: %s, (Start, End): (%s, %s)" %
        (os.getpid(), key_prefix, start_idx, end_idx))
  generator_utils.generate_files(
      dataset_generator(h5_filepath, key_prefix, chunk_size, start_idx,
                        end_idx), out_filepaths)

def testExamplesQueue(self):
  tf.set_random_seed(1)
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a file with 100 examples.
  def test_generator():
    for i in xrange(100):
      yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]}

  filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
  generator_utils.generate_files(test_generator(), filenames)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

  examples_train = data_reader.examples_reader(
      [tmp_file_path + "*"], {
          "inputs": tf.VarLenFeature(tf.int64),
          "targets": tf.VarLenFeature(tf.int64)
      }, training=True)
  examples_eval = data_reader.examples_reader(
      [tmp_file_path + "*"], {
          "inputs": tf.VarLenFeature(tf.int64),
          "targets": tf.VarLenFeature(tf.int64),
          "floats": tf.VarLenFeature(tf.float32)
      }, training=False)
  with tf.train.MonitoredSession() as session:
    # Evaluation data comes in the same order as in the file, check 10.
    for i in xrange(10):
      examples = session.run(examples_eval)
      self.assertEqual(len(examples["inputs"]), 1)
      self.assertEqual(len(examples["targets"]), 1)
      self.assertEqual(examples["inputs"][0], i)
      self.assertEqual(examples["targets"][0], i)
      self.assertEqual(examples["floats"][0], i + 0.5)
    # Training data is shuffled.
    is_shuffled = False
    for i in xrange(10):
      examples = session.run(examples_train)
      self.assertEqual(len(examples["inputs"]), 1)
      self.assertEqual(len(examples["targets"]), 1)
      self.assertEqual(examples["inputs"][0], examples["targets"][0])
      if examples["inputs"][0] != i:
        is_shuffled = True
    self.assertTrue(is_shuffled)

  # Clean up.
  os.remove(tmp_file_path + "-train-00000-of-00001")
  os.remove(tmp_file_path)

def generate_data_one(args):
  problem, data_dir, tmp_dir, input_file, output_file = args
  output_fp = open(output_file, "w")
  for sample in problem.generate_encoded_samples(data_dir, tmp_dir,
                                                 input_file):
    print("{}\t{}".format(sample["input_id"][0], sample["inputs_d"]),
          file=output_fp)
  output_fp.close()
  return
  # Note: unreachable due to the early return above; this path would write
  # packed TFRecords instead of the text dump.
  generator_utils.generate_files(
      problem._maybe_pack_examples(
          problem.generate_encoded_samples(data_dir, tmp_dir, input_file)),
      [output_file])

def testExamplesQueue(self):
  tf.set_random_seed(1)
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a file with 100 examples.
  def test_generator():
    for i in xrange(100):
      yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]}

  generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

  examples_train = data_reader.examples_queue(
      [tmp_file_path + "*"], {
          "inputs": tf.VarLenFeature(tf.int64),
          "targets": tf.VarLenFeature(tf.int64)
      }, training=True)
  examples_eval = data_reader.examples_queue(
      [tmp_file_path + "*"], {
          "inputs": tf.VarLenFeature(tf.int64),
          "targets": tf.VarLenFeature(tf.int64),
          "floats": tf.VarLenFeature(tf.float32)
      }, training=False)
  with tf.train.MonitoredSession() as session:
    # Evaluation data comes in the same order as in the file, check 10.
    for i in xrange(10):
      examples = session.run(examples_eval)
      self.assertEqual(len(examples["inputs"]), 1)
      self.assertEqual(len(examples["targets"]), 1)
      self.assertEqual(examples["inputs"][0], i)
      self.assertEqual(examples["targets"][0], i)
      self.assertEqual(examples["floats"][0], i + 0.5)
    # Training data is shuffled.
    is_shuffled = False
    for i in xrange(10):
      examples = session.run(examples_train)
      self.assertEqual(len(examples["inputs"]), 1)
      self.assertEqual(len(examples["targets"]), 1)
      self.assertEqual(examples["inputs"][0], examples["targets"][0])
      if examples["inputs"][0] != i:
        is_shuffled = True
    self.assertTrue(is_shuffled)

  # Clean up.
  os.remove(tmp_file_path + "-00000-of-00001")
  os.remove(tmp_file_path)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(data_dir, 10, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)
  midi_files = glob.glob('data/maestro/maestro-v2.0.0/*/*.midi')
  random.seed(13)
  random.shuffle(midi_files)
  generator_utils.generate_files(self.generator(midi_files[:50]), dev_paths)
  generator_utils.generate_files(self.generator(midi_files[50:]),
                                 train_paths)
  generator_utils.shuffle_dataset(train_paths)

def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
  """Encode all frames in dataset with model and write them out to out_files."""
  batch_size = 8
  dataset = dataset.batch(batch_size)
  examples = dataset.make_one_shot_iterator().get_next()
  images = examples.pop("frame")
  images = tf.expand_dims(images, 1)
  encoded = model.encode(images)
  encoded_frame_height = int(
      math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
  encoded_frame_width = int(
      math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
  num_bits = 8
  encoded = tf.reshape(
      encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
  encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)
  pngs = tf.map_fn(tf.image.encode_png, encoded, dtype=tf.string,
                   back_prop=False)
  with tf.Session() as sess:
    autoencoder_saver = tf.train.Saver(tf.global_variables("autoencoder.*"))
    trainer_lib.restore_checkpoint(autoencoder_path, autoencoder_saver, sess,
                                   must_restore=True)

    def generator():
      """Generate examples."""
      while True:
        try:
          pngs_np, examples_np = sess.run([pngs, examples])
          rewards_np = [list(el) for el in examples_np["reward"]]
          actions_np = [list(el) for el in examples_np["action"]]
          pngs_np = [el for el in pngs_np]
          for action, reward, png in zip(actions_np, rewards_np, pngs_np):
            yield {
                "action": action,
                "reward": reward,
                "image/encoded": [png],
                "image/format": ["png"],
                "image/height": [encoded_frame_height],
                "image/width": [encoded_frame_width],
            }
        except tf.errors.OutOfRangeError:
          break

    generator_utils.generate_files(
        generator(), out_files,
        cycle_every_n=problem.total_number_of_frames // 10)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)

def testGenerateFiles(self):
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Generate a trivial file and assert the file exists.
  def test_generator():
    yield {"inputs": [1], "target": [1]}

  generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
  self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

  # Clean up.
  os.remove(tmp_file_path + "-00000-of-00001")
  os.remove(tmp_file_path)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """ TODO """ train_paths = self.training_filepaths( data_dir, self.num_shards, shuffled=False) dev_paths = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False) if self.use_train_shards_for_dev: all_paths = train_paths + dev_paths generator_utils.generate_files( self.generator(data_dir, tmp_dir, True), all_paths) generator_utils.shuffle_dataset(all_paths) else: generator_utils.generate_dataset_and_shuffle( self.generator(data_dir, tmp_dir, True), train_paths, self.generator(data_dir, tmp_dir, False), dev_paths)
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]
  num_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, 1)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  all_output_files = train_output_files + dev_output_files
  generator_utils.shuffle_dataset(all_output_files)

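# The UNSHUFFLED_SUFFIX pairing above matters: shuffle_dataset reads the
# unshuffled shards back, shuffles their records, and writes copies whose
# names drop the suffix, so readers only ever see the shuffled files. A
# minimal sketch of that contract, assuming the tensor2tensor helpers; the
# problem name and directory are illustrative, not from the source.
from tensor2tensor.data_generators import generator_utils

unshuffled_files = generator_utils.train_data_filenames(
    "my_problem" + generator_utils.UNSHUFFLED_SUFFIX, "/tmp/t2t_data", 10)
# e.g. /tmp/t2t_data/my_problem-unshuffled-train-00000-of-00010
generator_utils.shuffle_dataset(unshuffled_files)
# Writes /tmp/t2t_data/my_problem-train-00000-of-00010, etc.
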
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # e.g. problem_name-train-00000-of-00001
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  # e.g. problem_name-dev-00000-of-00001
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_t2t_example(uni, svg):
  print(f'{bcolors.BOLD}Generating tfrecord...{bcolors.ENDC}', end='')
  path, width, vwidth = _parse_svg(uni, svg)
  errorString = None
  if _is_valid_glyph(uni, width, vwidth):
    if len(path) > maxpaths:
      # Too many paths!
      errorString = f'{chr(uni)} ({uni}) has too many paths: {len(path)}'
    elif len(path) == 0:
      # No paths!
      errorString = f'{chr(uni)} ({uni}) has no paths'
    else:
      # Super clunky, but we have to get our example in the right format.
      example = _create_example(uni, path)
      tempexamplefile = tempfile.NamedTemporaryFile(mode='w', delete=False)
      tempexamplefile.close()
      # We must delete the file before we call generate_files.
      Path(tempexamplefile.name).unlink()
      generator_utils.generate_files(
          _generate_sample(example), [tempexamplefile.name], max_cases=1)
      # https://www.tensorflow.org/tutorials/load_data/tfrecord
      raw_dataset = tf.data.TFRecordDataset([tempexamplefile.name])
      for raw_record in raw_dataset.take(1):
        example = raw_record
      Path(tempexamplefile.name).unlink()  # Delete for real.
      print(f'{bcolors.OKGREEN}SUCCESS{bcolors.ENDC}')
      return {'error': None, 'example': example}
  else:
    errorString = f'{chr(uni)} ({uni}) is invalid'
  print(f'{bcolors.FAIL}{errorString}{bcolors.ENDC}')
  return {'error': errorString, 'example': None}

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, TEST_DATASET), test_paths)
  all_paths = train_paths + dev_paths
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, TRAIN_DATASET), all_paths)
  generator_utils.shuffle_dataset(all_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates training/dev data. Args: data_dir: a string tmp_dir: a string task_id: an optional integer Returns: shard or shards for which data was generated. """ tf.logging.info("generate_data task_id=%s" % task_id) encoder = self.get_or_create_vocab(data_dir, tmp_dir) assert task_id >= 0 and task_id < self.num_generate_tasks if task_id < self.num_train_shards: out_file = self.training_filepaths( data_dir, self.num_train_shards, shuffled=False)[task_id] else: out_file = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False)[task_id - self.num_train_shards] generator_utils.generate_files( self.example_generator(encoder, tmp_dir, task_id), [out_file]) generator_utils.shuffle_dataset([out_file])
def generate_data(self, data_dir, tmp_dir=None, task_id=-1):
  """Saves the current epoch rollouts to disk, split into train/dev sets."""
  if not self._rollouts_by_epoch_and_split[self.current_epoch]:
    # Data not loaded from disk.
    self._split_current_epoch()
  rollouts_by_split = self._rollouts_by_epoch_and_split[self.current_epoch]
  splits_and_paths = self.splits_and_paths(data_dir)
  for (split, paths) in splits_and_paths:
    rollouts = rollouts_by_split[split]
    num_frames = self._calc_num_frames(rollouts)
    shard_size = num_frames // len(paths)
    frame_gen = self._generate_frames(rollouts)
    for (path_index, path) in enumerate(paths):
      limit = shard_size
      # Put the remainder in the last shard to preserve the ordering.
      if path_index == len(paths) - 1:
        limit = None
      generator_utils.generate_files(
          itertools.islice(frame_gen, limit), [path],
          cycle_every_n=float("inf"))

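# The cycle_every_n=float("inf") calls above lean on how generate_files
# spreads examples across its output files: it advances to the next file
# every cycle_every_n examples, so an infinite period with a single path
# keeps every example from the iterator in that one shard. That is how the
# rollout writers keep a rollout's frames contiguous. A self-contained
# sketch under that reading of the helper; the path and toy examples are
# illustrative, not from the source.
import os
import tempfile

from tensor2tensor.data_generators import generator_utils

path = os.path.join(tempfile.mkdtemp(), "rollouts-train-00000-of-00001")
frames = ({"frame_number": [i], "reward": [0]} for i in range(100))
generator_utils.generate_files(frames, [path], cycle_every_n=float("inf"))
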
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_gen = self.generator(tmp_dir, 162770)
  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  generator_utils.generate_files(train_gen, train_paths)
  dev_gen = self.generator(tmp_dir, 19867, 162770)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
  generator_utils.generate_files(dev_gen, dev_paths)
  test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
  test_paths = self.test_filepaths(data_dir, self.test_shards,
                                   shuffled=False)
  generator_utils.generate_files(test_gen, test_paths)
  generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  if task_id == -1 or task_id is None:
    for i in range(FLAGS.wiki_revision_num_train_shards +
                   FLAGS.wiki_revision_num_dev_shards):
      self.generate_data(data_dir, tmp_dir, i)
    return

  tf.logging.info(
      "Flags for job (task_id {}): "
      "Dev shards: {}, Train shards: {}, "
      "Revision skip factor: {}, Max page size: 2**{}, Introduce errors: {}, "
      "Percent Identical Examples: {}"
      "".format(task_id, FLAGS.wiki_revision_num_dev_shards,
                FLAGS.wiki_revision_num_train_shards,
                FLAGS.wiki_revision_revision_skip_factor,
                FLAGS.wiki_revision_max_page_size_exp,
                FLAGS.wiki_revision_introduce_errors,
                FLAGS.wiki_revision_percent_identical_examples))

  if FLAGS.wiki_revision_vocab_file:
    encoder = wiki_revision_utils.get_encoder_from_vocab(
        FLAGS.wiki_revision_vocab_file)
  else:
    encoder = wiki_revision_utils.get_or_generate_vocabulary(
        data_dir, tmp_dir, FLAGS.wiki_revision_data_prefix,
        FLAGS.wiki_revision_max_page_size_exp, self.approx_vocab_size,
        self.strip)

  random.seed(123)
  if task_id < FLAGS.wiki_revision_num_train_shards:
    out_file = self.training_filepaths(
        data_dir, FLAGS.wiki_revision_num_train_shards,
        shuffled=False)[task_id]
  else:
    out_file = self.dev_filepaths(
        data_dir, FLAGS.wiki_revision_num_dev_shards,
        shuffled=False)[task_id - FLAGS.wiki_revision_num_train_shards]

  tf.logging.info("Generating files for path: %s", out_file)
  self.corpus_files = wiki_revision_utils.corpus_files_for_shard(
      task_id, FLAGS.wiki_revision_num_train_shards,
      FLAGS.wiki_revision_num_dev_shards, FLAGS.wiki_revision_data_prefix)
  example_generator = self.generator(encoder, self.corpus_files, tmp_dir)
  packed_example_generator = self._maybe_pack_examples(example_generator)
  generator_utils.generate_files(packed_example_generator, [out_file])
  generator_utils.shuffle_dataset([out_file])

  tf.logging.info(
      "Job stats: identity examples: {}, total examples {}, ratio: {}".format(
          self.num_identity_examples, self.num_total_examples,
          (1 + self.num_identity_examples) / (1 + self.num_total_examples)))
  job_stats_string = self.aggregate_job_stats()
  out_dir, filename = out_file.replace("-unshuffled", "").rsplit("/", 1)
  stats_prefix = "/stats_"
  stats_file_path = "".join([out_dir, stats_prefix, filename])
  if tf.gfile.Exists(
      stats_file_path) and tf.gfile.Open(stats_file_path).size() != 0:
    tf.logging.info("Skipping writing stats because output file exists.")
  else:
    with tf.gfile.Open(stats_file_path, "w") as out:
      tf.logging.info("Writing job stats to {}".format(stats_file_path))
      out.write(job_stats_string)
  tf.logging.info(job_stats_string)

def produce_examples(shard_ids, wikis_dir, refs_dir, urls_dir, vocab_path,
                     out_filepaths):
  """Produce examples from shard_ids to out_filepaths."""
  # * Join the Wikipedia articles with their references
  # * Run TF-IDF to sort reference paragraphs
  # * Encode the Wikipedia and reference text with the vocabulary
  # * Write out TFRecords of tensorflow.Example
  tf.logging.info("Processing %d input shards into %d output files.",
                  len(shard_ids), len(out_filepaths))
  vocab = text_encoder.SubwordTextEncoder(vocab_path)
  eot_ids = vocab.encode(EOT)

  def example_generator():
    """Generate Example dicts."""
    stats = dict(
        total_original_wikis=0,
        total_original_refs=0,
        total_found_refs=0,
        ref_lengths=[],
        wiki_original_refs=[],
        wiki_found_refs=[],
        wikis_skipped_no_refs=0,
        wikis_skipped_short_lead=0,
        num_wikis_written=0)
    ref_files_by_shard = _references_files_by_shard(refs_dir)
    for shard_id in shard_ids:
      tf.logging.info("Processing shard %d", shard_id)
      wiki_urls = _wiki_urls_for_shard(shard_id, urls_dir)
      tf.logging.info("Loaded wiki URLs for shard")
      refs_content = _references_content(ref_files_by_shard[shard_id])
      tf.logging.info("Loaded reference content for shard")
      for i, wiki in enumerate(_wiki_articles(shard_id, wikis_dir)):
        if not i % 1000:
          tf.logging.info("Processing wiki index %d for shard %d", i,
                          shard_id)
        stats["total_original_wikis"] += 1

        # Get reference content.
        wiki_ref_content = []
        ref_urls = wiki_urls[wiki.url]["refs"]
        stats["total_original_refs"] += len(ref_urls)
        stats_wiki_original_refs = len(ref_urls)
        stats_wiki_found_refs = 0
        for ref_url in ref_urls:
          ref_content = refs_content.get(ref_url)
          if not ref_content:
            continue
          stats["total_found_refs"] += 1
          stats["ref_lengths"].append(len(ref_content))
          stats_wiki_found_refs += 1
          wiki_ref_content.append(ref_content)
        stats["wiki_original_refs"].append(stats_wiki_original_refs)
        stats["wiki_found_refs"].append(stats_wiki_found_refs)
        if not wiki_ref_content or len(wiki_ref_content) < _MIN_REFS:
          # No refs (or too few) were found.
          stats["wikis_skipped_no_refs"] += 1
          continue

        # Rank reference paragraphs with TF-IDF.
        wiki_title = _normalize_text(wiki.title)
        ranked_paragraphs = rank_reference_paragraphs(wiki_title,
                                                      wiki_ref_content)

        # Construct inputs from Wiki title and references.
        inputs = []
        inputs.extend(vocab.encode(wiki_title))
        inputs.extend(eot_ids)
        for paragraph in ranked_paragraphs:
          if len(inputs) >= 1e6:
            break
          paragraph += " "
          inputs.extend(vocab.encode(paragraph))

        # Construct targets from article sections.
        targets, section_boundaries = _encode_wiki_sections(
            wiki.sections, vocab)

        # Skip if lead section is too short.
        if (not section_boundaries or
            section_boundaries[0] < _MIN_LEADSECTION_TOKENS):
          stats["wikis_skipped_short_lead"] += 1
          continue

        inputs.append(text_encoder.EOS_ID)
        targets.append(text_encoder.EOS_ID)
        stats["num_wikis_written"] += 1
        yield {
            "inputs": inputs,
            "targets": targets,
            "section_boundaries": section_boundaries,
        }

    tf.logging.info("Total: %d, Skipped: %d", stats["num_wikis_written"],
                    stats["total_original_wikis"] -
                    stats["num_wikis_written"])
    tf.logging.info("Total refs: %d, Skipped refs: %d",
                    stats["total_found_refs"],
                    stats["total_original_refs"] - stats["total_found_refs"])
    stats_fname = os.path.join(
        os.path.split(out_filepaths[0])[0], "stats.%d.json" % shard_ids[0])
    with tf.gfile.Open(stats_fname, "w") as f:
      f.write(json.dumps(stats))

  generator_utils.generate_files(example_generator(), out_filepaths)