The examples below collect real-world usages of generate_dataset_and_shuffle from tensor2tensor's generator_utils across open-source projects.

Code example #1
 def generate_data(self, data_dir, _, num_shards=100):
     utils.generate_dataset_and_shuffle(
         identity_generator(self.num_symbols, 40, 100000),
         self.training_filepaths(data_dir, num_shards, shuffled=True),
         identity_generator(self.num_symbols, 400, 10000),
         self.dev_filepaths(data_dir, 1, shuffled=True),
         shuffle=False)
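Note the calling pattern in this example: the generator already yields cases in random order, so the shards are written under their final (shuffled=True) filenames and shuffle=False skips the on-disk shuffle pass. A minimal sketch contrasting the two styles, assuming tensor2tensor is installed; toy_generator and the literal paths are hypothetical stand-ins:

import os
from tensor2tensor.data_generators import generator_utils

os.makedirs("/tmp/t2t", exist_ok=True)

def toy_generator(num_cases):
    # Hypothetical generator yielding already-encoded example dicts.
    for i in range(num_cases):
        yield {"inputs": [i + 2], "targets": [i + 2, 1]}  # 1 == EOS

train_paths = ["/tmp/t2t/toy-train-00000-of-00001"]
dev_paths = ["/tmp/t2t/toy-dev-00000-of-00001"]

# Default style: write the shards, then shuffle them on disk. In real
# Problems, training_filepaths(..., shuffled=False) yields "-unshuffled"
# names that shuffle_dataset rewrites without the suffix.
generator_utils.generate_dataset_and_shuffle(
    toy_generator(1000), train_paths,
    toy_generator(100), dev_paths)  # shuffle=True is the default

# Style used above: output is already random, so write directly to the
# final filenames and pass shuffle=False to skip the shuffle pass.
generator_utils.generate_dataset_and_shuffle(
    toy_generator(1000), train_paths,
    toy_generator(100), dev_paths, shuffle=False)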
Code example #2
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

        if self.use_train_shards_for_dev:
            all_paths = train_paths + dev_paths
            generator_utils.generate_files(
                self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS),
                all_paths)
            generator_utils.shuffle_dataset(all_paths)
        else:
            generator_utils.generate_dataset_and_shuffle(
                self.generator(data_dir, tmp_dir,
                               self.TRAIN_DATASETS), train_paths,
                self.generator(data_dir, tmp_dir, self.DEV_DATASETS),
                dev_paths)
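For reference, generate_dataset_and_shuffle itself is roughly the three-call sequence that the use_train_shards_for_dev branch above spells out by hand. The following is a paraphrase of the tensor2tensor utility, not its exact source:

def generate_dataset_and_shuffle_sketch(train_gen, train_paths,
                                        dev_gen, dev_paths, shuffle=True):
    generator_utils.generate_files(train_gen, train_paths)
    generator_utils.generate_files(dev_gen, dev_paths)
    if shuffle:
        generator_utils.shuffle_dataset(train_paths + dev_paths)

When use_train_shards_for_dev is set, train and dev examples are written across the combined shard list and shuffled together, so the dev split is drawn from the same distribution as training.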
Code example #3
File: image_lsun.py  Project: qixiuai/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   """Generates LSUN bedrooms dataset and writes it in data_dir."""
   generator_utils.generate_dataset_and_shuffle(
       self.read_and_convert_to_png(tmp_dir, "train"),
       self.training_filepaths(data_dir, 100, shuffled=False),
       self.read_and_convert_to_png(tmp_dir, "val"),
       self.dev_filepaths(data_dir, 1, shuffled=False))
Code example #4
File: imdb.py  Project: AranKomat/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
   generator_utils.generate_dataset_and_shuffle(
       self.generator(data_dir, tmp_dir, True), train_paths,
       self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #5
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     generator_utils.generate_dataset_and_shuffle(
         self.generator(tmp_dir, 162770),  # train
         self.training_filepaths(data_dir, self.train_shards,
                                 shuffled=False),
         self.generator(tmp_dir, 19867, 162770),  # dev
         self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))
Code example #6
File: algorithmic.py  Project: zabin10/tensor2tensor
    def generate_data(self, data_dir, _, num_shards=None):
        if num_shards is None:
            num_shards = self.num_shards

        def generator_eos(generator):
            """Shift by NUM_RESERVED_IDS and append EOS token."""
            for case in generator:
                new_case = {}
                for feature in case:
                    new_case[feature] = [
                        i + text_encoder.NUM_RESERVED_TOKENS
                        for i in case[feature]
                    ] + [text_encoder.EOS_ID]
                yield new_case

        train_generator_eos = lambda: generator_eos(  # pylint: disable=g-long-lambda
            self.train_generator(self.num_symbols, self.train_length,
                                 self.train_size))
        dev_generator_eos = lambda: generator_eos(  # pylint: disable=g-long-lambda
            self.dev_generator(self.num_symbols, self.dev_length,
                               self.dev_size))

        utils.generate_dataset_and_shuffle(train_generator_eos(),
                                           self.training_filepaths(
                                               data_dir,
                                               num_shards,
                                               shuffled=True),
                                           dev_generator_eos(),
                                           self.dev_filepaths(data_dir,
                                                              1,
                                                              shuffled=True),
                                           shuffle=False)
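The generator_eos wrapper (the same pattern reappears in the algorithmic examples further down) shifts raw symbol ids past tensor2tensor's reserved ids and appends EOS. A standalone sketch of the transformation, assuming tensor2tensor is installed; shift_and_append_eos is a hypothetical name:

from tensor2tensor.data_generators import text_encoder

def shift_and_append_eos(case):
    # NUM_RESERVED_TOKENS is 2 (PAD=0, EOS=1), so shifted ids never
    # collide with the reserved range.
    return {
        feature: [i + text_encoder.NUM_RESERVED_TOKENS for i in ids]
                 + [text_encoder.EOS_ID]
        for feature, ids in case.items()
    }

# Example: {"inputs": [0, 1]} -> {"inputs": [2, 3, 1]}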
Code example #7
File: imagenet.py  Project: y12uc231/BERT-1
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True),
         self.training_filepaths(data_dir, self.train_shards,
                                 shuffled=True),
         self.generator(data_dir, tmp_dir, False),
         self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))
Code example #8
File: vqa.py  Project: hubayirp/fabric-vsf
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
         self.training_filepaths(data_dir,
                                 self.train_shards,
                                 shuffled=False),
         self.generator(data_dir, tmp_dir, problem.DatasetSplit.EVAL),
         self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))
Code example #9
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(data_dir,
                                           self.num_shards,
                                           shuffled=False)
     dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True), train_paths,
         self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #10
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True),
         self.training_filepaths(data_dir, self.train_shards,
                                 shuffled=True),
         self.generator(data_dir, tmp_dir, False),
         self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))
Code example #11
    def generate_data(self, data_dir, _, task_id=-1):

        utils.generate_dataset_and_shuffle(
            self.generate_encoded_samples(self.num_symbols, self.train_length,
                                          self.train_size),
            self.training_filepaths(data_dir, self.num_shards, shuffled=True),
            self.generate_encoded_samples(self.num_symbols, self.dev_length,
                                          self.dev_size),
            self.dev_filepaths(data_dir, 1, shuffled=True),
            shuffle=False)
Code example #12
File: ice_parsing.py  Project: hubayirp/fabric-vsf
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     generator_utils.generate_dataset_and_shuffle(
         tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
                                        self.source_vocab_size,
                                        self.targeted_vocab_size),
         self.training_filepaths(data_dir, self.num_shards, shuffled=False),
         tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
                                        self.source_vocab_size,
                                        self.targeted_vocab_size),
         self.dev_filepaths(data_dir, 1, shuffled=False))
Code example #13
File: ice_parsing.py  Project: chqiwang/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   generator_utils.generate_dataset_and_shuffle(
       tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
                                      self.source_vocab_size,
                                      self.targeted_vocab_size),
       self.training_filepaths(data_dir, self.num_shards, shuffled=False),
       tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
                                      self.source_vocab_size,
                                      self.targeted_vocab_size),
       self.dev_filepaths(data_dir, 1, shuffled=False))
Code example #14
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
      '''Generate `tf.example` data from a text file and write it to
      `data_dir`. Called by `t2t-datagen`.
      '''
     train_paths = self.training_filepaths(data_dir,
                                           self.num_shards,
                                           shuffled=False)
     dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True), train_paths,
         self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #15
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        import resource
        # Raise the soft limit on open files to the hard limit: writing and
        # shuffling many shards opens many files at once.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

        generator_utils.generate_dataset_and_shuffle(
            self.generator(data_dir, tmp_dir, True),
            self.training_filepaths(data_dir, self.train_shards,
                                    shuffled=True),
            self.generator(data_dir, tmp_dir, False),
            self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))
Code example #16
File: librispeech.py  Project: zeyu-h/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   if self.use_train_shards_for_dev:
     all_paths = train_paths + dev_paths
     generator_utils.generate_files(
         self.generator(data_dir, tmp_dir, True), all_paths)
     generator_utils.shuffle_dataset(all_paths)
   else:
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True), train_paths,
         self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #17
 def generate_data(self, data_dir, tmp_dir, num_shards=None):
     if num_shards is None:
         num_shards = self.num_shards
     if self.is_character_level:
         generator_utils.generate_dataset_and_shuffle(
             self.train_generator(tmp_dir, True),
             self.training_filepaths(data_dir, num_shards, shuffled=False),
             self.dev_generator(tmp_dir, False),
             self.dev_filepaths(data_dir, 1, shuffled=False))
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.train_generator(tmp_dir, True, self.targeted_vocab_size),
             self.training_filepaths(data_dir, num_shards, shuffled=False),
             self.dev_generator(tmp_dir, False, self.targeted_vocab_size),
             self.dev_filepaths(data_dir, 1, shuffled=False))
Code example #18
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     """ TODO """
     train_paths = self.training_filepaths(
         data_dir, self.num_shards, shuffled=False)
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards, shuffled=False)
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #19
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Generates tf-refords for the problem.

    Args:
      data_dir: String. The directory to generate TF-Records to.
      tmp_dir: String. The directory to download the unprocessed data to.

    Returns:
      None.
    """
        self.maybe_download_and_unzip(tmp_dir)
        generator_utils.generate_dataset_and_shuffle(
            self.generator(tmp_dir, is_training=True),
            self.training_filepaths(data_dir, 100, shuffled=True),
            self.generator(tmp_dir, is_training=False),
            self.dev_filepaths(data_dir, 1, shuffled=True))
Code example #20
File: problem.py  Project: zqma2/RL4NMT
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(
         data_dir, self.num_shards,
         shuffled=False)  # problem_name-train-00000-of-00001
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards,
         shuffled=False)  # problem_name-dev-00000-of-00001
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Code example #21
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)

        train_examples = self._examples(data_dir, tmp_dir, train=True)
        dev_examples = self._examples(data_dir, tmp_dir, train=False)

        encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_file, self.targeted_vocab_size,
            (e['sentence1'] + ' ' + e['sentence2']
             for e in train_examples + dev_examples))

        generator_utils.generate_dataset_and_shuffle(
            self._inputs_and_targets(encoder, train_examples), train_paths,
            self._inputs_and_targets(encoder, dev_examples), dev_paths)
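get_or_generate_vocab_inner builds a subword vocabulary of roughly the targeted size from any iterable of text, caches the vocab file under data_dir, and returns the encoder. Note that the generator expression above iterates over train_examples + dev_examples, so _examples must return lists. A hedged sketch with illustrative arguments:

encoder = generator_utils.get_or_generate_vocab_inner(
    "/tmp/t2t",            # data_dir: where the vocab file is cached
    "vocab.example.8192",  # vocab filename (illustrative)
    8192,                  # targeted vocab size
    ("a b c", "d e f"))    # any iterable of text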
Code example #22
def main():
    data_dir = os.path.expanduser(FLAGS.data_dir)
    tmp_dir = os.path.expanduser(FLAGS.tmp_dir)
    p = DeepseaProblem()
    p.maybe_download_and_unzip(tmp_dir)

    def capped_generator(tmp_dir, is_training, max_to_gen):
        g = p.generator(tmp_dir, is_training)
        for _, example_dict in zip(range(max_to_gen), g):
            yield example_dict

    # Generate a subset of the data.
    generator_utils.generate_dataset_and_shuffle(
        capped_generator(tmp_dir, True, FLAGS.num_train),
        p.training_filepaths(data_dir, 1, shuffled=True),
        capped_generator(tmp_dir, False, FLAGS.num_dev),
        p.dev_filepaths(data_dir, 1, shuffled=True))
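capped_generator truncates the problem's generator to its first max_to_gen examples by zipping it with a bounded range; itertools.islice is the idiomatic equivalent (a sketch reusing the names from the example above):

import itertools

def capped_generator(tmp_dir, is_training, max_to_gen):
    return itertools.islice(p.generator(tmp_dir, is_training), max_to_gen)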
Code example #23
    def generate_data(self, data_dir, _, task_id=-1):
        def generator_eos(nbr_symbols, max_length, nbr_cases):
            """Shift by NUM_RESERVED_IDS and append EOS token."""
            for case in self.generator(nbr_symbols, max_length, nbr_cases):
                new_case = {}
                for feature in case:
                    new_case[feature] = [
                        i + text_encoder.NUM_RESERVED_TOKENS
                        for i in case[feature]
                    ] + [text_encoder.EOS_ID]
                yield new_case

        utils.generate_dataset_and_shuffle(
            generator_eos(self.num_symbols, self.train_length, self.train_size),
            self.training_filepaths(data_dir, self.num_shards, shuffled=True),
            generator_eos(self.num_symbols, self.dev_length, self.dev_size),
            self.dev_filepaths(data_dir, 1, shuffled=True),
            shuffle=False)
Code example #24
File: multinli.py  Project: AranKomat/tensor2tensor
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    train_paths = self.training_filepaths(
        data_dir, self.num_shards, shuffled=False)
    dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)

    train_examples = self._examples(data_dir, tmp_dir, train=True)
    dev_examples = self._examples(data_dir, tmp_dir, train=False)

    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_file, self.targeted_vocab_size,
        (e['sentence1'] + ' ' + e['sentence2']
         for e in train_examples + dev_examples)
        )

    generator_utils.generate_dataset_and_shuffle(
        self._inputs_and_targets(encoder, train_examples), train_paths,
        self._inputs_and_targets(encoder, dev_examples), dev_paths)
Code example #25
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Generates training/dev data.


    Args:
      data_dir: The base directory where data and vocab files are stored.
      tmp_dir: temp directory to download and extract the dataset
      task_id: an optional integer
    """

        tmp_dir = babi_qa._prepare_babi_data(tmp_dir, data_dir)

        babi_qa._build_vocab(self.generate_text_for_vocab(data_dir, tmp_dir),
                             data_dir, self.vocab_filename)

        train_paths = self.training_filepaths(data_dir,
                                              self.num_train_shards,
                                              shuffled=False)

        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)

        metadata_path = os.path.join(data_dir, self.meta_data_filename())

        train_parsed = babi_qa._babi_parser(tmp_dir, self.babi_task_id,
                                            self.babi_subset,
                                            problem.DatasetSplit.TRAIN,
                                            self.joint_training)

        dev_parsed = babi_qa._babi_parser(tmp_dir, self.babi_task_id,
                                          self.babi_subset,
                                          problem.DatasetSplit.EVAL,
                                          self.joint_training)

        encoder = self.get_or_create_vocab(data_dir)
        label_encoder = self.get_labels_encoder(data_dir)

        train_parsed_processed, dev_parsed_processed = (
            self._preprocess_babi_input_data(train_parsed, dev_parsed, encoder,
                                             label_encoder, metadata_path))

        generator_utils.generate_dataset_and_shuffle(
            self.generator(train_parsed_processed), train_paths,
            self.generator(dev_parsed_processed), dev_paths)
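This is the most involved pipeline in the listing: prepare the raw bAbI data, build the vocabulary, create the text and label encoders, parse and preprocess the train/dev splits, and only then hand the encoded generators to generate_dataset_and_shuffle.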
Code example #26
File: algorithmic.py  Project: qixiuai/tensor2tensor
  def generate_data(self, data_dir, _, task_id=-1):

    def generator_eos(nbr_symbols, max_length, nbr_cases):
      """Shift by NUM_RESERVED_IDS and append EOS token."""
      for case in self.generator(nbr_symbols, max_length, nbr_cases):
        new_case = {}
        for feature in case:
          new_case[feature] = [
              i + text_encoder.NUM_RESERVED_TOKENS for i in case[feature]
          ] + [text_encoder.EOS_ID]
        yield new_case

    utils.generate_dataset_and_shuffle(
        generator_eos(self.num_symbols, self.train_length, self.train_size),
        self.training_filepaths(data_dir, self.num_shards, shuffled=True),
        generator_eos(self.num_symbols, self.dev_length, self.dev_size),
        self.dev_filepaths(data_dir, 1, shuffled=True),
        shuffle=False)
Code example #27
File: vqa.py  Project: qixiuai/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   generator_utils.generate_dataset_and_shuffle(
       self.generator(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
       self.training_filepaths(data_dir, self.train_shards, shuffled=False),
       self.generator(data_dir, tmp_dir, problem.DatasetSplit.EVAL),
       self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))
Code example #28
File: celeba.py  Project: kltony/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   generator_utils.generate_dataset_and_shuffle(
       self.generator(tmp_dir, 162770),  # train
       self.training_filepaths(data_dir, self.train_shards, shuffled=False),
       self.generator(tmp_dir, 19867, 162770),  # dev
       self.dev_filepaths(data_dir, self.dev_shards, shuffled=False))
Code example #29
 def generate_data(self, data_dir, tmp_dir, num_shards=100):
   generator_utils.generate_dataset_and_shuffle(
       ende_wordpiece_token_generator(tmp_dir, True, self.target_vocab_size),
       self.training_filepaths(data_dir, num_shards, shuffled=False),
       ende_wordpiece_token_generator(tmp_dir, False, self.target_vocab_size),
       self.dev_filepaths(data_dir, 1, shuffled=False))
Code example #30
File: imagenet.py  Project: qixiuai/tensor2tensor
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   generator_utils.generate_dataset_and_shuffle(
       self.generator(data_dir, tmp_dir, True),
       self.training_filepaths(data_dir, self.train_shards, shuffled=True),
       self.generator(data_dir, tmp_dir, False),
       self.dev_filepaths(data_dir, self.dev_shards, shuffled=True))