Code Example #1
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     dataset_path = (ENZH_RAW_DATASETS["TRAIN"]
                     if train else ENZH_RAW_DATASETS["DEV"])
     train_path = get_enzh_raw_dataset(tmp_dir, dataset_path)
     return text_problems.text2text_txt_iterator(train_path + ".en",
                                                 train_path + ".zh")
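However the surrounding setup differs, every variant on this page ends the same way: text2text_txt_iterator walks two parallel text files line by line and yields one {"inputs": ..., "targets": ...} dict per line pair. A minimal sketch of that behavior, with hypothetical file names:

from tensor2tensor.data_generators import text_problems

# Hypothetical parallel corpus: line N of corpus.en aligns with line N of corpus.zh.
for sample in text_problems.text2text_txt_iterator("corpus.en", "corpus.zh"):
    print(sample["inputs"])   # source line from corpus.en
    print(sample["targets"])  # aligned target line from corpus.zh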
Code Example #2
File: translate.py, Project: qixiuai/tensor2tensor
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   datasets = self.source_data_files(dataset_split)
   tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
   data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name,
                                                                   tag))
   return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                               data_path + ".lang2")
Code Example #3
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = self.source_data_files(dataset_split)
     """
     for item in datasets:
         dummy_file_name = item[0].split('/')[-1]
         create_dummy_tar(tmp_dir, dummy_file_name)
         s_file, t_file = item[1][0], item[1][1]
          if not os.path.exists(os.path.join(tmp_dir, s_file)):
              raise Exception("Be sure file '%s' exists in tmp dir" % s_file)
          if not os.path.exists(os.path.join(tmp_dir, t_file)):
              raise Exception("Be sure file '%s' exists in tmp dir" % t_file)
     """
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
      target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
     target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
     source_encoder = text_encoder.TokenTextEncoder(
         source_vocab_filename, replace_oov=self.oov_token)
     target_encoder = text_encoder.TokenTextEncoder(
         target_vocab_filename, replace_oov=self.oov_token)
     tag = "train" if train else "dev"
     filename_base = "%s-compiled-%s" % (self.name, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + '.lang1',
                                              data_path + '.lang2'),
         source_encoder, target_encoder)
Code Example #4
File: translate.py, Project: zhyq/tensor2tensor
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
     datasets = self.source_data_files(dataset_split)
     tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
     data_path = compile_data(tmp_dir, datasets,
                              "%s-compiled-%s" % (self.name, tag))
     return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2")
Code Example #5
    def generate_samples(self, data_dir, tmp_dir, dataset_split):

        # Vocab
        src_token_path = (os.path.join(data_dir, self.source_vocab_name),
                          self.source_vocab_name)
        target_token_path = (os.path.join(data_dir, self.target_vocab_name),
                             self.target_vocab_name)

        for token_path, vocab_name in [src_token_path, target_token_path]:
            if not tf.gfile.Exists(token_path):
                bpe_vocab = os.path.join(tmp_dir, vocab_name)
                with tf.gfile.Open(bpe_vocab) as f:
                    vocab_list = f.read().split("\n")
                vocab_list.append(self.oov_token)
                text_encoder.TokenTextEncoder(
                    None, vocab_list=vocab_list).store_to_file(token_path)

        tag = 'eval'
        if dataset_split == problem.DatasetSplit.TRAIN:
            tag = 'train'

        fn_inputs = os.path.join(tmp_dir, "inputs.{}.txt".format(tag))
        fn_targets = os.path.join(tmp_dir, "targets.{}.txt".format(tag))

        return text_problems.text2text_txt_iterator(fn_inputs, fn_targets)
Code Example #6
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   train_dataset = self.get_training_dataset(tmp_dir)
   datasets = train_dataset if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
   target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.source_vocab_name,
       self.approx_vocab_size,
       source_datasets,
       file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.target_vocab_name,
       self.approx_vocab_size,
       target_datasets,
       file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return text_problems.text2text_generate_encoded(
       text_problems.text2text_txt_iterator(data_path + ".lang1",
                                            data_path + ".lang2"),
       source_vocab, target_vocab)
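The generate_encoded_samples variants such as this one wrap the same iterator in text2text_generate_encoded so that each sample carries token ids instead of raw text. A rough sketch of what that wrapping amounts to, assuming source_vocab and target_vocab are the encoders built above (the real helper also handles problems without inputs):

from tensor2tensor.data_generators import text_encoder, text_problems

def encoded_samples(data_path, source_vocab, target_vocab):
    # Approximation of text2text_generate_encoded: encode both sides, append EOS.
    for sample in text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                       data_path + ".lang2"):
        sample["inputs"] = source_vocab.encode(sample["inputs"]) + [text_encoder.EOS_ID]
        sample["targets"] = target_vocab.encode(sample["targets"]) + [text_encoder.EOS_ID]
        yield sample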
Code Example #7
def compile_data(tmp_dir, datasets, filename):
    """Concatenate all `datasets` and save to `filename`."""
    filename = os.path.join(tmp_dir, filename)
    lang1_fname = filename + ".en"
    lang2_fname = filename + ".sv"
    if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
        tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                        lang1_fname, lang2_fname)

    with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
        with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
            for dataset in datasets:
                path = dataset[0]

                lang1_filename, lang2_filename = dataset[1]
                lang1_filepath = os.path.join(path, lang1_filename)
                lang2_filepath = os.path.join(path, lang2_filename)

                is_sgm = (lang1_filename.endswith("sgm")
                          and lang2_filename.endswith("sgm"))

                for example in text_problems.text2text_txt_iterator(
                        lang1_filepath, lang2_filepath):
                    line1res = _preprocess_sgm(example["inputs"], is_sgm)
                    line2res = _preprocess_sgm(example["targets"], is_sgm)
                    if line1res and line2res:
                        lang1_resfile.write(line1res)
                        lang1_resfile.write("\n")
                        lang2_resfile.write(line2res)
                        lang2_resfile.write("\n")

    return filename
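The calling pattern for this compile_data variant mirrors the tests in Code Examples #29 and #32 below: compile the datasets once into tmp_dir, then iterate the two files it wrote. A short usage sketch with hypothetical paths; note that this variant writes ".en"/".sv" suffixes rather than the ".lang1"/".lang2" used elsewhere, and expects (directory, (source_file, target_file)) entries:

# Hypothetical dataset entry: (directory, (source_file, target_file)).
datasets = [("/data/europarl", ("europarl-v7.en", "europarl-v7.sv"))]
data_path = compile_data("/tmp/t2t_datagen", datasets, "ensv-compiled-train")
for example in text_problems.text2text_txt_iterator(data_path + ".en",
                                                     data_path + ".sv"):
    print(example["inputs"], example["targets"])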
Code Example #8
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        if dataset_split == problem.DatasetSplit.TRAIN:
            source_file_name = self.compile_corpus_files(
                data_dir, SOURCE_TRAIN_FILES,
                self.source_compiled_corpus_filename[0])
            target_file_name = self.compile_corpus_files(
                data_dir, TARGET_TRAIN_FILES,
                self.target_compiled_corpus_filename[0])
        elif dataset_split == problem.DatasetSplit.EVAL:
            source_file_name = self.compile_corpus_files(
                data_dir, SOURCE_DEV_FILES,
                self.source_compiled_corpus_filename[1])
            target_file_name = self.compile_corpus_files(
                data_dir, TARGET_DEV_FILES,
                self.target_compiled_corpus_filename[1])
        elif dataset_split == problem.DatasetSplit.TEST:
            source_file_name = self.compile_corpus_files(
                data_dir, SOURCE_TEST_FILES,
                self.source_compiled_corpus_filename[2])
            target_file_name = self.compile_corpus_files(
                data_dir, TARGET_TEST_FILES,
                self.target_compiled_corpus_filename[2])

        return text_problems.text2text_txt_iterator(
            os.path.join(data_dir, source_file_name),
            os.path.join(data_dir, target_file_name))
Code Example #9
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=self.approx_vocab_size,
         model_prefix=self.source_vocab_name,
         sources=source_datasets,
         file_byte_budget=1e10)
     target_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=int(self.approx_vocab_size / 2),
         model_prefix=self.target_vocab_name,
         sources=target_datasets,
         file_byte_budget=1e10)
     tag = "train" if train else "dev"
     filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         source_vocab, target_vocab)
Code Example #10
File: wsj_parsing.py, Project: yufengm/tensor2tensor
 def generate_text_for_vocab(self, data_dir, tmp_dir):
     files = [os.path.join(tmp_dir, f) for f in self.TRAIN_FILES]
     inputs_file, targets_file = files
     for sample in text_problems.text2text_txt_iterator(
             inputs_file, targets_file):
         yield sample["inputs"]
         yield sample["targets"]
Code Example #11
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):

        train = dataset_split == problem.DatasetSplit.TRAIN

        source_datasets = _INPUT_FILES
        target_datasets = _OUTPUT_FILES

        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "test"

        filename_src = "en_{}.src".format(tag)
        filename_dst = "ru_{}.dst".format(tag)

        data_path = './shad_nlp18_contextNMT/data_fused/'

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + filename_src,
                                                 data_path + filename_dst),
            source_vocab, target_vocab)
Code Example #12
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        datasets = self.source_data_files(dataset_split)
        tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

        # create shared vocabulary
        if self.vocab_type == "subwords":
            data_path = translate.compile_data(
                tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
            self.get_or_create_vocab(data_dir, tmp_dir)
            sample_iterator = text_problems.text2text_txt_iterator(
                data_path + ".lang1", data_path + ".lang2")
        elif self.vocab_type == "tokens":
            sample_iterator = super().generate_samples(data_dir, tmp_dir,
                                                       dataset_split)
        else:
            raise ValueError("VocabType not supported")

        # create source feature vocabularies
        data_path = self.compile_sfeat_data(
            tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
        self.create_src_feature_vocabs(data_dir, tmp_dir)
        sfeat_iterator = text_problems.txt_line_iterator(data_path + ".sfeat")

        def _generate(sample_iterator, sfeat_iterator):
            for sample in sample_iterator:
                sample["sfeats"] = next(sfeat_iterator)
                yield sample

        return _generate(sample_iterator, sfeat_iterator)
Code Example #13
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Instance of token generator for the WMT en->de task, training set."""
        dataset_path = ("num2text-p8-v7/num2text")
        train_path = _get_num2text_dataset(tmp_dir, dataset_path)

        return text_problems.text2text_txt_iterator(
            train_path + "_num_p8_v7.txt", train_path + "_txt_p8_v7.txt")
Code Example #14
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset  #if train else _NC_TEST_DATASETS
        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)
        tag = "train" if train else "dev"

        filename_base = "%s" % (tag)

        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".source",
                                                 data_path + ".target"),
            source_vocab, target_vocab)
Code Example #15
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_encs_chr_%s" % tag)
   return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                               data_path + ".lang2")
Code Example #16
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        is_train_dataset = dataset_split == problem.DatasetSplit.TRAIN
        dataset_label = 'train' if is_train_dataset else 'dev'
        ext = '.txt'
        he_path = os.path.join(data_dir, 'he.'+dataset_label+ext)
        en_path = os.path.join(data_dir, 'en.'+dataset_label+ext)

        return text_problems.text2text_txt_iterator(he_path, en_path)
Code Example #17
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_encs_chr_%s" % tag)
     return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2")
Code Example #18
 def testText2TextTxtIterator(self):
   inputs = []
   targets = []
   for entry in text_problems.text2text_txt_iterator(self.inputs_file,
                                                     self.targets_file):
     inputs.append(entry["inputs"])
     targets.append(entry["targets"])
   self.assertEqual(inputs, self.inputs)
   self.assertEqual(targets, self.targets)
Code Example #19
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        training_source_file = os.path.join(config.DATA_DIR,
                                            'training_source.txt')
        training_target_file = os.path.join(config.DATA_DIR,
                                            'training_target.txt')
        dev_source_file = os.path.join(config.DATA_DIR, 'dev_source.txt')
        dev_target_file = os.path.join(config.DATA_DIR, 'dev_target.txt')

        train = dataset_split == problem.DatasetSplit.TRAIN
        source_file = (training_source_file if train else dev_source_file)
        target_file = (training_target_file if train else dev_target_file)

        # def generator_samples_content(get_source, get_target):
        #     source, target = None, None
        #
        #     with tf.gfile.GFile(source_file, mode='r') as f_x_train, \
        #             tf.gfile.GFile(target_file, mode='r') as f_y_train:
        #
        #         mrs = f_x_train.read().splitlines()
        #         utterances = f_y_train.read().splitlines()
        #
        #         for mr, utt in zip(mrs, utterances):
        #             yield mr, utt
        #
        # def generator_source():
        #     for source, _ in generator_samples_content(False, True):
        #         yield source.strip()
        #
        # def generator_target():
        #     for _, target in generator_samples_content(False, True):
        #         yield target.strip()
        #
        # # Generate vocab for both source and target
        # source_vocab = generator_utils.get_or_generate_vocab_inner(
        #     data_dir=data_dir,
        #     vocab_filename=self.vocab_input_filename,
        #     vocab_size=self.input_vocab_size,
        #     generator=generator_source())
        #
        # target_vocab = generator_utils.get_or_generate_vocab_inner(
        #     data_dir=data_dir,
        #     vocab_filename=self.vocab_target_filename,
        #     vocab_size=self.target_vocab_size,
        #     generator=generator_target())
        #
        # with io.open('data/training_source.txt', 'r', encoding='utf8') as f_x_train, \
        #         io.open('data/training_target.txt', 'r', encoding='utf8') as f_y_train:
        #     mrs = f_x_train.read().splitlines()
        #     utterances = f_y_train.read().splitlines()
        #
        #     for mr, utt in zip(mrs, utterances):
        #         yield {
        #             'inputs': mr,
        #             'targets': utt
        #         }

        return text_problems.text2text_txt_iterator(source_file, target_file)
Code Example #20
 def testText2TextTxtIterator(self):
     inputs = []
     targets = []
     for entry in text_problems.text2text_txt_iterator(
             self.inputs_file, self.targets_file):
         inputs.append(entry["inputs"])
         targets.append(entry["targets"])
     self.assertEqual(inputs, self.inputs)
     self.assertEqual(targets, self.targets)
Code Example #21
 def generate_text_for_vocab(self, data_dir, tmp_dir):
   files = [os.path.join(tmp_dir, f) for f in self.TRAIN_FILES]
   inputs_file, targets_file = files
   for i, sample in enumerate(
       text_problems.text2text_txt_iterator(inputs_file, targets_file)):
     yield sample["inputs"]
     yield sample["targets"]
     if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab:
       break
Code Example #22
    def generate_samples(self, data_dir, _tmp_dir, dataset_split):  #pylint: disable=no-self-use
        """This method returns the generator to return {"inputs": [text], "targets": [text]} dict"""

        functions_file_path = os.path.join(data_dir,
                                           '{}.function'.format(dataset_split))
        docstrings_file_path = os.path.join(
            data_dir, '{}.docstring'.format(dataset_split))

        return text_problems.text2text_txt_iterator(functions_file_path,
                                                    docstrings_file_path)
Code Example #23
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Instance of token generator for the WMT en->de task, training set."""

        if dataset_split == problem.DatasetSplit.TRAIN:
            dataset_path = OUT_SENTS
        else:
            dataset_path = OUT_TEST

        return text_problems.text2text_txt_iterator(dataset_path + "en",
                                                    dataset_path + "fr")
Code Example #24
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Returns the generator of {"inputs": [text], "targets": [text]} dict."""

    functions_file_path = os.path.join(
        data_dir, '{}.function'.format(dataset_split))
    docstrings_file_path = os.path.join(
        data_dir, '{}.docstring'.format(dataset_split))

    return text_problems.text2text_txt_iterator(
        functions_file_path, docstrings_file_path)
Code Example #25
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Returns the generator of {"inputs": [text], "targets": [text]} dict."""

        functions_file_path = os.path.join(data_dir,
                                           '{}.function'.format(dataset_split))
        docstrings_file_path = os.path.join(
            data_dir, '{}.docstring'.format(dataset_split))

        return text_problems.text2text_txt_iterator(functions_file_path,
                                                    docstrings_file_path)
Code Example #26
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        is_train_dataset = dataset_split == problem.DatasetSplit.TRAIN
        ext = '.txt'
        dataset_label = os.getenv(
            'TRAIN_NAME') if is_train_dataset else os.getenv('DEV_NAME')
        original_data_dir = os.getenv('DATA_DIR')

        he_path = os.path.join(original_data_dir, 'he.' + dataset_label + ext)
        en_path = os.path.join(original_data_dir, 'en.' + dataset_label + ext)

        return text_problems.text2text_txt_iterator(he_path, en_path)
Code Example #27
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        training_source_file = os.path.join(config.DATA_DIR, 'training_source.txt')
        training_target_file = os.path.join(config.DATA_DIR, 'training_target.txt')
        dev_source_file = os.path.join(config.DATA_DIR, 'dev_source.txt')
        dev_target_file = os.path.join(config.DATA_DIR, 'dev_target.txt')

        train = dataset_split == problem.DatasetSplit.TRAIN
        source_file = (training_source_file if train else dev_source_file)
        target_file = (training_target_file if train else dev_target_file)

        return text_problems.text2text_txt_iterator(source_file, target_file)
Code Example #28
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   datasets = self.source_data_files(dataset_split)
   tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
   data_path = translate.compile_data(
       tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
   # For eval, use authentic data.
   if dataset_split != problem.DatasetSplit.TRAIN:
     for example in text_problems.text2text_txt_iterator(
         data_path + ".lang1", data_path + ".lang2"):
       yield example
   else:  # For training, mix synthetic and authentic data as follows.
     for (file1, file2) in self.backtranslate_data_filenames:
       path1 = os.path.join(tmp_dir, file1)
       path2 = os.path.join(tmp_dir, file2)
       # Synthetic data first.
       for example in text_problems.text2text_txt_iterator(path1, path2):
         yield example
       # Now authentic data.
       for example in text_problems.text2text_txt_iterator(
           data_path + ".lang1", data_path + ".lang2"):
         yield example
Code Example #29
  def testCompileData(self):
    filename = "out"
    filepath = os.path.join(self.tmp_dir, filename)
    translate.compile_data(self.tmp_dir, self.DATASETS, filename)

    count = 0
    for i, example in enumerate(
        text_problems.text2text_txt_iterator(filepath + ".lang1",
                                             filepath + ".lang2")):
      expected = self.data[i]
      self.assertEqual(list(expected), [example["inputs"], example["targets"]])
      count += 1
    self.assertEqual(count, len(self.data))
Code Example #30
File: translate.py, Project: yuxiazff/tensor2tensor
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    datasets = self.source_data_files(dataset_split)
    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
    data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name,
                                                                    tag))

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                data_path + ".lang2")
Code Example #31
File: translate.py, Project: kltony/tensor2tensor
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    datasets = self.source_data_files(dataset_split)
    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
    data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name,
                                                                    tag))

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                data_path + ".lang2")
Code Example #32
  def testCompileData(self):
    filename = "out"
    filepath = os.path.join(self.tmp_dir, filename)
    translate.compile_data(self.tmp_dir, self.DATASETS, filename)

    count = 0
    for i, example in enumerate(
        text_problems.text2text_txt_iterator(filepath + ".lang1",
                                             filepath + ".lang2")):
      expected = self.data[i]
      self.assertEqual(list(expected), [example["inputs"], example["targets"]])
      count += 1
    self.assertEqual(count, len(self.data))
Code Example #33
File: merge_vocab.py, Project: ultimatedaotu/ai-edu
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        datasets = _TRAIN_DATASETS if train else _DEV_DATASETS

        vocab_list = []
        print("=======Get Vocab from ", self.vocab_name, '...', end='')
        with open(self.vocab_name, 'r', encoding='utf-8') as f:
            vocab_list = f.read().splitlines()
        print("=======Done")

        vocab = text_encoder.TokenTextEncoder(
            vocab_filename=None,
            vocab_list=vocab_list,
            replace_oov="<UNK>",
            num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS)

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(datasets[0], datasets[1]),
            vocab, vocab)
Code Example #34
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   datasets = self.source_data_files(dataset_split)
   tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
   data_path = translate.compile_data(
       tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
   # Iterator over authentic data.
   it_auth = text_problems.text2text_txt_iterator(
       data_path + ".lang1", data_path + ".lang2")
   # For eval, use authentic data.
   if dataset_split != problem.DatasetSplit.TRAIN:
     for example in it_auth:
       yield example
   else:  # For training, mix synthetic and authentic data as follows.
     for (file1, file2) in self.backtranslate_data_filenames:
       path1 = os.path.join(tmp_dir, file1)
       path2 = os.path.join(tmp_dir, file2)
       # Synthetic data first.
       for example in text_problems.text2text_txt_iterator(path1, path2):
         yield example
       # Now authentic data.
       for example in it_auth:
         yield example
Code Example #35
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS

        source_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.source_vocab_name),
                                                     replace_oov=self.oov_token)
        target_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.target_vocab_name),
                                                     replace_oov=self.oov_token)
        tag = "train" if train else "dev"
        filename_base = "wmt_pdre_tok_%s" % tag
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Code Example #36
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        if train:
            train_path = os.path.join(tmp_dir, "oversampled",
                                      GEC_DATASETS["TRAIN"])
        else:
            train_path = os.path.join(tmp_dir, "wi_locness",
                                      GEC_DATASETS["DEV"])
        vocab_path = os.path.join(tmp_dir, GEC_DATASETS["VOCAB"])
        vocab_path_src_tgt = os.path.join(data_dir, GEC_DATASETS["VOCAB"])

        if not tf.gfile.Exists(vocab_path_src_tgt):
            tf.gfile.Copy(vocab_path, vocab_path_src_tgt)
            with tf.gfile.GFile(vocab_path_src_tgt, mode="r") as fr:
                vocab_data = "<pad>\n<EOS>\n" + fr.read()
                with tf.gfile.GFile(vocab_path_src_tgt, mode="w") as fw:
                    fw.write(vocab_data)
        return text_problems.text2text_txt_iterator(train_path + ".src",
                                                    train_path + ".tgt")
Code Example #37
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.approx_vocab_size,
         _ZHZH_TRAIN_DATASETS,
         file_byte_budget=1e8)
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "mydata_enzh_tok_%s" % tag)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         symbolizer_vocab, symbolizer_vocab)
Code Example #38
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Instance of token generator for the WMT en->de task, training set."""
        train = dataset_split == problem.DatasetSplit.TRAIN
        dataset_path = ("train.tok.clean.bpe.32000"
                        if train else "newstest2013.tok.bpe.32000")
        train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path)

        # Vocab
        vocab_path = os.path.join(data_dir, self.vocab_filename)
        if not tf.gfile.Exists(vocab_path):
            bpe_vocab = os.path.join(tmp_dir, "vocab.bpe.32000")
            with tf.gfile.Open(bpe_vocab) as f:
                vocab_list = f.read().split("\n")
            vocab_list.append(self.oov_token)
            text_encoder.TokenTextEncoder(
                None, vocab_list=vocab_list).store_to_file(vocab_path)

        return text_problems.text2text_txt_iterator(train_path + ".en",
                                                    train_path + ".de")
Code Example #39
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Instance of token generator for the WMT en->de task, training set."""
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset_path = ("train.tok.clean.bpe.32000"
                    if train else "newstest2013.tok.bpe.32000")
    train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path)

    # Vocab
    vocab_path = os.path.join(data_dir, self.vocab_filename)
    if not tf.gfile.Exists(vocab_path):
      bpe_vocab = os.path.join(tmp_dir, "vocab.bpe.32000")
      with tf.gfile.Open(bpe_vocab) as f:
        vocab_list = f.read().split("\n")
      vocab_list.append(self.oov_token)
      text_encoder.TokenTextEncoder(
          None, vocab_list=vocab_list).store_to_file(vocab_path)

    return text_problems.text2text_txt_iterator(train_path + ".en",
                                                train_path + ".de")
Code Example #40
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file)
Code Example #41
File: translate.py, Project: qixiuai/tensor2tensor
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname,
                    lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename.strip(".gz")
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath.strip(".gz")
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath.strip(".gz")
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")

  return filename
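This fuller variant from tensor2tensor's translate.py downloads each dataset[0] URL and accepts two shapes for dataset[1]: a plain (source_file, target_file) pair, or a ("tsv", source_column, target_column, glob_pattern) spec for tab-separated corpora. A hedged sketch of both entry shapes; the URLs and file names are illustrative only:

# Plain parallel-file entry: the two files are looked up inside the downloaded archive.
plain_entry = ("http://www.example.com/corpus.tar.gz",
               ("corpus.tok.en", "corpus.tok.de"))

# TSV entry: column 0 holds the source sentence, column 1 the target.
tsv_entry = ("http://www.example.com/pairs.tgz",
             ("tsv", 0, 1, "pairs/*.tsv"))

data_path = compile_data("/tmp/t2t_datagen", [plain_entry, tsv_entry],
                         "ende-compiled-train")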