def generate_data(self, data_dir, _, num_shards=100):
  utils.generate_dataset_and_shuffle(
      identity_generator(self.num_symbols, 40, 100000),
      self.training_filepaths(data_dir, num_shards, shuffled=True),
      identity_generator(self.num_symbols, 400, 10000),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates LSUN bedrooms dataset and writes it in data_dir.""" generator_utils.generate_dataset_and_shuffle( self.read_and_convert_to_png(tmp_dir, "train"), self.training_filepaths(data_dir, 100, shuffled=False), self.read_and_convert_to_png(tmp_dir, "val"), self.dev_filepaths(data_dir, 1, shuffled=False))
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True), train_paths,
      self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(tmp_dir, 162770),  # train
      self.training_filepaths(data_dir, self.train_shards, shuffled=False),
      self.generator(tmp_dir, 19867, 162770),  # dev
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates LSUN bedrooms dataset and writes it in data_dir.""" generator_utils.generate_dataset_and_shuffle( self.read_and_convert_to_png(tmp_dir, "train"), self.training_filepaths(data_dir, 100, shuffled=False), self.read_and_convert_to_png(tmp_dir, "val"), self.dev_filepaths(data_dir, 1, shuffled=False))
def generate_data(self, data_dir, _, num_shards=None):
  if num_shards is None:
    num_shards = self.num_shards

  def generator_eos(generator):
    """Shift by NUM_RESERVED_IDS and append EOS token."""
    for case in generator:
      new_case = {}
      for feature in case:
        new_case[feature] = [
            i + text_encoder.NUM_RESERVED_TOKENS for i in case[feature]
        ] + [text_encoder.EOS_ID]
      yield new_case

  train_generator_eos = lambda: generator_eos(  # pylint: disable=g-long-lambda
      self.train_generator(self.num_symbols, self.train_length,
                           self.train_size))
  dev_generator_eos = lambda: generator_eos(  # pylint: disable=g-long-lambda
      self.dev_generator(self.num_symbols, self.dev_length, self.dev_size))

  utils.generate_dataset_and_shuffle(
      train_generator_eos(),
      self.training_filepaths(data_dir, num_shards, shuffled=True),
      dev_generator_eos(),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True),
      self.training_filepaths(data_dir, self.train_shards, shuffled=True),
      self.generator(data_dir, tmp_dir, False),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
      self.training_filepaths(data_dir, self.train_shards, shuffled=False),
      self.generator(data_dir, tmp_dir, problem.DatasetSplit.EVAL),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True), train_paths,
      self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  import logging
  logging.info('[DBG] Generating data')
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True),
      self.training_filepaths(data_dir, self.train_shards, shuffled=True),
      self.generator(data_dir, tmp_dir, False),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))

def generate_data(self, data_dir, _, task_id=-1):
  utils.generate_dataset_and_shuffle(
      self.generate_encoded_samples(self.num_symbols, self.train_length,
                                    self.train_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=True),
      self.generate_encoded_samples(self.num_symbols, self.dev_length,
                                    self.dev_size),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
                                     self.source_vocab_size,
                                     self.targeted_vocab_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=False),
      tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
                                     self.source_vocab_size,
                                     self.targeted_vocab_size),
      self.dev_filepaths(data_dir, 1, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
                                     self.source_vocab_size,
                                     self.targeted_vocab_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=False),
      tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
                                     self.source_vocab_size,
                                     self.targeted_vocab_size),
      self.dev_filepaths(data_dir, 1, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  '''Generates `tf.Example` data from a text file and writes it to `data_dir`.

  Called by `t2t-datagen`.
  '''
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True), train_paths,
      self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Raise the soft open-file limit to the hard limit; sharded generation
  # keeps many output files open at once.
  import resource
  low, high = resource.getrlimit(resource.RLIMIT_NOFILE)
  print('DBG', low, high)
  resource.setrlimit(resource.RLIMIT_NOFILE, (high, high))
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True),
      self.training_filepaths(data_dir, self.train_shards, shuffled=True),
      self.generator(data_dir, tmp_dir, False),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, num_shards=None):
  if num_shards is None:
    num_shards = self.num_shards
  if self.is_character_level:
    generator_utils.generate_dataset_and_shuffle(
        self.train_generator(tmp_dir, True),
        self.training_filepaths(data_dir, num_shards, shuffled=False),
        self.dev_generator(tmp_dir, False),
        self.dev_filepaths(data_dir, 1, shuffled=False))
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.train_generator(tmp_dir, True, self.targeted_vocab_size),
        self.training_filepaths(data_dir, num_shards, shuffled=False),
        self.dev_generator(tmp_dir, False, self.targeted_vocab_size),
        self.dev_filepaths(data_dir, 1, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1): """ TODO """ train_paths = self.training_filepaths( data_dir, self.num_shards, shuffled=False) dev_paths = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False) if self.use_train_shards_for_dev: all_paths = train_paths + dev_paths generator_utils.generate_files( self.generator(data_dir, tmp_dir, True), all_paths) generator_utils.shuffle_dataset(all_paths) else: generator_utils.generate_dataset_and_shuffle( self.generator(data_dir, tmp_dir, True), train_paths, self.generator(data_dir, tmp_dir, False), dev_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates tf-refords for the problem. Args: data_dir: String. The directory to generate TF-Records to. tmp_dir: String. The directory to download the unprocessed data to. Returns: None. """ self.maybe_download_and_unzip(tmp_dir) generator_utils.generate_dataset_and_shuffle( self.generator(tmp_dir, is_training=True), self.training_filepaths(data_dir, 100, shuffled=True), self.generator(tmp_dir, is_training=False), self.dev_filepaths(data_dir, 1, shuffled=True))
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # e.g. problem_name-train-00000-of-00001
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  # e.g. problem_name-dev-00000-of-00001
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  train_examples = self._examples(data_dir, tmp_dir, train=True)
  dev_examples = self._examples(data_dir, tmp_dir, train=False)
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      (e['sentence1'] + ' ' + e['sentence2']
       for e in train_examples + dev_examples))
  generator_utils.generate_dataset_and_shuffle(
      self._inputs_and_targets(encoder, train_examples), train_paths,
      self._inputs_and_targets(encoder, dev_examples), dev_paths)

def main():
  data_dir = os.path.expanduser(FLAGS.data_dir)
  tmp_dir = os.path.expanduser(FLAGS.tmp_dir)
  p = DeepseaProblem()
  p.maybe_download_and_unzip(FLAGS.tmp_dir)

  def capped_generator(tmp_dir, is_training, max_to_gen):
    # Yield at most max_to_gen examples from the full generator.
    g = p.generator(tmp_dir, is_training)
    for _, example_dict in zip(xrange(max_to_gen), g):
      yield example_dict

  # Generate a subset of the data.
  generator_utils.generate_dataset_and_shuffle(
      capped_generator(tmp_dir, True, FLAGS.num_train),
      p.training_filepaths(data_dir, 1, shuffled=True),
      capped_generator(tmp_dir, False, FLAGS.num_dev),
      p.dev_filepaths(data_dir, 1, shuffled=True))

def generate_data(self, data_dir, _, task_id=-1):

  def generator_eos(nbr_symbols, max_length, nbr_cases):
    """Shift by NUM_RESERVED_IDS and append EOS token."""
    for case in self.generator(nbr_symbols, max_length, nbr_cases):
      new_case = {}
      for feature in case:
        new_case[feature] = [
            i + text_encoder.NUM_RESERVED_TOKENS for i in case[feature]
        ] + [text_encoder.EOS_ID]
      yield new_case

  utils.generate_dataset_and_shuffle(
      generator_eos(self.num_symbols, self.train_length, self.train_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=True),
      generator_eos(self.num_symbols, self.dev_length, self.dev_size),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  train_examples = self._examples(data_dir, tmp_dir, train=True)
  dev_examples = self._examples(data_dir, tmp_dir, train=False)
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      (e['sentence1'] + ' ' + e['sentence2']
       for e in train_examples + dev_examples))
  generator_utils.generate_dataset_and_shuffle(
      self._inputs_and_targets(encoder, train_examples), train_paths,
      self._inputs_and_targets(encoder, dev_examples), dev_paths)

def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates training/dev data. Args: data_dir: The base directory where data and vocab files are stored. tmp_dir: temp directory to download and extract the dataset task_id: an optional integer """ tmp_dir = babi_qa._prepare_babi_data(tmp_dir, data_dir) babi_qa._build_vocab(self.generate_text_for_vocab(data_dir, tmp_dir), data_dir, self.vocab_filename) train_paths = self.training_filepaths(data_dir, self.num_train_shards, shuffled=False) dev_paths = self.dev_filepaths(data_dir, self.num_dev_shards, shuffled=False) metadata_path = os.path.join(data_dir, self.meta_data_filename()) train_parsed = babi_qa._babi_parser(tmp_dir, self.babi_task_id, self.babi_subset, problem.DatasetSplit.TRAIN, self.joint_training) dev_parsed = babi_qa._babi_parser(tmp_dir, self.babi_task_id, self.babi_subset, problem.DatasetSplit.EVAL, self.joint_training) encoder = self.get_or_create_vocab(data_dir) label_encoder = self.get_labels_encoder(data_dir) train_parsed_processed, dev_parsed_processed = ( self._preprocess_babi_input_data(train_parsed, dev_parsed, encoder, label_encoder, metadata_path)) generator_utils.generate_dataset_and_shuffle( self.generator(train_parsed_processed), train_paths, self.generator(dev_parsed_processed), dev_paths)
def generate_data(self, data_dir, _, task_id=-1):

  def generator_eos(nbr_symbols, max_length, nbr_cases):
    """Shift by NUM_RESERVED_IDS and append EOS token."""
    for case in self.generator(nbr_symbols, max_length, nbr_cases):
      new_case = {}
      for feature in case:
        new_case[feature] = [
            i + text_encoder.NUM_RESERVED_TOKENS for i in case[feature]
        ] + [text_encoder.EOS_ID]
      yield new_case

  utils.generate_dataset_and_shuffle(
      generator_eos(self.num_symbols, self.train_length, self.train_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=True),
      generator_eos(self.num_symbols, self.dev_length, self.dev_size),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
      self.training_filepaths(data_dir, self.train_shards, shuffled=False),
      self.generator(data_dir, tmp_dir, problem.DatasetSplit.EVAL),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(tmp_dir, 162770),  # train
      self.training_filepaths(data_dir, self.train_shards, shuffled=False),
      self.generator(tmp_dir, 19867, 162770),  # dev
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))

def generate_data(self, data_dir, tmp_dir, num_shards=100):
  generator_utils.generate_dataset_and_shuffle(
      ende_wordpiece_token_generator(tmp_dir, True, self.target_vocab_size),
      self.training_filepaths(data_dir, num_shards, shuffled=False),
      ende_wordpiece_token_generator(tmp_dir, False, self.target_vocab_size),
      self.dev_filepaths(data_dir, 1, shuffled=False))

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  generator_utils.generate_dataset_and_shuffle(
      self.generator(data_dir, tmp_dir, True),
      self.training_filepaths(data_dir, self.train_shards, shuffled=True),
      self.generator(data_dir, tmp_dir, False),
      self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))

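
All of the variants above reduce to the same call shape: a training generator and its shard paths, plus a dev generator and its shard paths. The following is a minimal sketch of that pattern, assuming the tensor2tensor `generator_utils` and `problem.Problem` APIs used above; `ToyCopyProblem` and its `_cases` helper are hypothetical and only illustrate the wiring, they are not one of the problems listed here.

# Hypothetical illustration of the common pattern, not a real problem class.
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem


class ToyCopyProblem(problem.Problem):
  """Hypothetical problem: targets are a copy of the inputs."""

  def _cases(self, num_cases):
    # Each yielded dict of int lists becomes one tf.Example.
    for i in range(num_cases):
      seq = [(i + j) % 7 + 2 for j in range(5)]  # stay above reserved ids 0/1
      yield {"inputs": seq, "targets": seq}

  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    # Write unshuffled shards, then shuffle them, as in the snippets above.
    generator_utils.generate_dataset_and_shuffle(
        self._cases(1000),
        self.training_filepaths(data_dir, 1, shuffled=False),
        self._cases(100),
        self.dev_filepaths(data_dir, 1, shuffled=False))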