Example 1
def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single file."""
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath), character_vocab)
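For context, text2text_generate_encoded wraps the plain-text sample iterator and yields the same dicts with the "inputs" and "targets" fields replaced by lists of integer IDs (an EOS id is appended to each). A minimal consumption sketch, assuming the tensor2tensor imports used above and an illustrative tmp_dir that already contains parsing_train.pairs:

# Illustrative only: iterate over the encoded generator defined above and
# inspect the first sample; each sample is a dict of integer ID lists.
for sample in tabbed_parsing_character_generator("/tmp/t2t_tmp", train=True):
    print(sample["inputs"][:10], sample["targets"][:10])
    break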
Example 2
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):

        train = dataset_split == problem.DatasetSplit.TRAIN

        source_datasets = _INPUT_FILES
        target_datasets = _OUTPUT_FILES

        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "test"

        filename_src = "en_{}.src".format(tag)
        filename_dst = "ru_{}.dst".format(tag)

        data_path = './shad_nlp18_contextNMT/data_fused/'

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + filename_src,
                                                 data_path + filename_dst),
            source_vocab, target_vocab)
Example 3
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = self.source_data_files(dataset_split)
     """
     for item in datasets:
         dummy_file_name = item[0].split('/')[-1]
         create_dummy_tar(tmp_dir, dummy_file_name)
         s_file, t_file = item[1][0], item[1][1]
         if not os.path.exists(os.path.join(tmp_dir, s_file)):
             raise Exception("Be sure file '%s' is exists in tmp dir" % s_file)
         if not os.path.exists(os.path.join(tmp_dir, t_file)):
             raise Exception("Be sure file '%s' is exists in tmp dir" % t_file)
     """
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
      target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
     target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
     source_encoder = text_encoder.TokenTextEncoder(
         source_vocab_filename, replace_oov=self.oov_token)
     target_encoder = text_encoder.TokenTextEncoder(
         target_vocab_filename, replace_oov=self.oov_token)
     tag = "train" if train else "dev"
     filename_base = "%s-compiled-%s" % (self.name, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + '.lang1',
                                              data_path + '.lang2'),
         source_encoder, target_encoder)
Example 4
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     """在生成数据的时候,主要是通过这个方法获取已编码样本的"""
     generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
     encoder = self.get_vocab(data_dir)
     target_encoder = self.get_vocab(data_dir, is_target=True)
     return text_problems.text2text_generate_encoded(generator, encoder, target_encoder,
                                                     has_inputs=self.has_inputs)
Example 5
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   train_dataset = self.get_training_dataset(tmp_dir)
   datasets = train_dataset if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
   target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.source_vocab_name,
       self.approx_vocab_size,
       source_datasets,
       file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.target_vocab_name,
       self.approx_vocab_size,
       target_datasets,
       file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return text_problems.text2text_generate_encoded(
       text_problems.text2text_txt_iterator(data_path + ".lang1",
                                            data_path + ".lang2"),
       source_vocab, target_vocab)
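The source_datasets/target_datasets lists built above follow tensor2tensor's download-spec convention: each entry is [archive_url, [file_inside_archive, ...]], and get_or_generate_vocab downloads and unpacks each archive into tmp_dir before reading the listed files. A minimal sketch of that structure, with hypothetical URL and file names:

# Hypothetical download spec (the URL and file names are placeholders, not a
# real dataset): one parallel-corpus archive containing an English and a
# Chinese file.
_EXAMPLE_TRAIN_DATASETS = [
    ["http://example.com/corpus-v1.tgz",
     ["corpus-v1/train.en", "corpus-v1/train.zh"]],
]
# The source vocab reads only the first file of each pair and the target vocab
# the second, mirroring the list comprehensions in the example above.
source_datasets = [[item[0], [item[1][0]]] for item in _EXAMPLE_TRAIN_DATASETS]
target_datasets = [[item[0], [item[1][1]]] for item in _EXAMPLE_TRAIN_DATASETS]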
Example 6
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
     encoders = self.get_or_create_vocab(data_dir, tmp_dir)
     return text_problems.text2text_generate_encoded(sample_generator=generator,
                                                     vocab=encoders["inputs"],
                                                     targets_vocab=encoders["targets"],
                                                     has_inputs=self.has_inputs)
Example 7
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=self.approx_vocab_size,
         model_prefix=self.source_vocab_name,
         sources=source_datasets,
         file_byte_budget=1e10)
     target_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=int(self.approx_vocab_size / 2),
         model_prefix=self.target_vocab_name,
         sources=target_datasets,
         file_byte_budget=1e10)
     tag = "train" if train else "dev"
     filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         source_vocab, target_vocab)
Example 8
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset  #if train else _NC_TEST_DATASETS
        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)
        tag = "train" if train else "dev"

        filename_base = "%s" % (tag)

        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".source",
                                                 data_path + ".target"),
            source_vocab, target_vocab)
Example 9
def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single file."""
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath), character_vocab)
Example 10
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        source_encoder = self.feature_encoders(data_dir)['inputs']
        target_encoder = self.feature_encoders(data_dir)['targets']

        return text_problems.text2text_generate_encoded(
            generator,
            source_encoder,
            targets_vocab=target_encoder,
            has_inputs=True)
Example 11
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     """
     override this function just for add "replace_oov = '<UNK>'
     """
     generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
     vocab_filename = os.path.join(data_dir, self.vocab_filename)
     encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                             replace_oov='<UNK>')
     return text2text_generate_encoded(generator,
                                       encoder,
                                       has_inputs=self.has_inputs)
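The TokenTextEncoder used with replace_oov above expects a vocabulary file with one token per line; unknown tokens are then mapped to the OOV symbol's id instead of raising an error. A small sketch with a hypothetical vocab file name:

# Hypothetical vocab file: one token per line. With the default reserved ids
# (<pad>, <EOS>), the file's tokens are numbered starting at 2.
with open("vocab.src.txt", "w", encoding="utf-8") as f:
    f.write("<UNK>\nhello\nworld\n")
vocab = text_encoder.TokenTextEncoder("vocab.src.txt", replace_oov="<UNK>")
print(vocab.encode("hello unseen world"))  # "unseen" is mapped to <UNK>'s id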
Example 12
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     """When data is generated, this is the method used to obtain the encoded samples.

     Args:
         data_dir: where the data produced by t2t-datagen is stored.
         tmp_dir: where the raw data lives.
     """
     generator = self.generate_samples(data_dir, tmp_dir, dataset_split)  # yields one sentence pair at a time
     encoder = self.get_vocab(data_dir)
     target_encoder = self.get_vocab(data_dir, is_target=True)
     # Encode the sentence pairs: has_inputs=True encodes the source side as
     # "inputs"; has_inputs=False encodes only the target side.
     return text_problems.text2text_generate_encoded(
         generator, encoder, target_encoder, has_inputs=self.has_inputs)
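As noted in the comment above, has_inputs=False makes text2text_generate_encoded encode only the "targets" field, which is the language-modeling case. A minimal sketch, with a hypothetical corpus file and a character-level vocabulary:

# Hypothetical targets-only generator: each yielded dict has just a "targets"
# field, so the encoded samples carry no "inputs".
def lm_samples(txt_path):
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            yield {"targets": line.strip()}

char_vocab = text_encoder.ByteTextEncoder()
lm_generator = text_problems.text2text_generate_encoded(
    lm_samples("corpus.txt"), char_vocab, has_inputs=False)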
Example 13
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        datasets = _TRAIN_DATASETS if train else _DEV_DATASETS

        vocab_list = []
        print("=======Get Vocab from ", self.vocab_name, '...', end='')
        with open(self.vocab_name, 'r', encoding='utf-8') as f:
            vocab_list = f.read().splitlines()
        print("=======Done")

        vocab = text_encoder.TokenTextEncoder(
            vocab_filename=None,
            vocab_list=vocab_list,
            replace_oov="<UNK>",
            num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS)

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(datasets[0], datasets[1]),
            vocab, vocab)
Example 14
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
  """Generate source and target data from a single file."""
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 0,
      prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size)
  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 1,
      prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size)
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath), source_vocab,
      target_vocab)
Example 15
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
  """Generate source and target data from a single file."""
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 0,
      prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size)
  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 1,
      prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size)
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath), source_vocab,
      target_vocab)
Example 16
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS

        source_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.source_vocab_name),
                                                     replace_oov=self.oov_token)
        target_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.target_vocab_name),
                                                     replace_oov=self.oov_token)
        tag = "train" if train else "dev"
        filename_base = "wmt_pdre_tok_%s" % tag
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example 17
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.approx_vocab_size,
         _ZHZH_TRAIN_DATASETS,
         file_byte_budget=1e8)
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "mydata_enzh_tok_%s" % tag)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         symbolizer_vocab, symbolizer_vocab)
Example 18
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS
        for item in datasets:
            dummy_file_name = item[0].split("/")[-1]
            create_dummy_tar(tmp_dir, dummy_file_name)
            s_file, t_file = item[1][0], item[1][1]
            if not os.path.exists(os.path.join(tmp_dir, s_file)):
                raise Exception("Be sure file '%s' is exists in tmp dir" %
                                s_file)
            if not os.path.exists(os.path.join(tmp_dir, t_file)):
                raise Exception("Be sure file '%s' is exists in tmp dir" %
                                t_file)

        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]

        # If a vocabulary file already exists it is used directly; otherwise one
        # is generated here. This builds the encoders and can create the vocab files.
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "dev"
        filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
        # Concatenate all the corpora into a single compiled file.
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example 19
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        # train_dataset = self.get_training_dataset(tmp_dir)
        if dataset_split == problem.DatasetSplit.TRAIN:
            datasets = LM_TRAIN_DATASETS
            tag = "train"
        elif dataset_split == problem.DatasetSplit.EVAL:
            datasets = LM_DEV_DATASETS
            tag = "dev"
        else:
            datasets = LM_TEST_DATASETS
            tag = "test"

        # train = dataset_split == problem.DatasetSplit.TRAIN

        # datasets = train_dataset if train else LM_TEST_DATASETS
        # source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        # target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
        source_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir=data_dir,
            vocab_filename=self.source_vocab_name,
            vocab_size=self.approx_vocab_size,
            generator=self.generate(tmp_dir=tmp_dir,
                                    source_filenames=self.source_filenames,
                                    index=1),
            max_subtoken_length=None)
        target_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir=data_dir,
            vocab_filename=self.target_vocab_name,
            vocab_size=self.approx_vocab_size,
            generator=self.generate(tmp_dir=tmp_dir,
                                    source_filenames=self.source_filenames,
                                    index=2),
            max_subtoken_length=1)
        # tag = "train" if train else "dev"
        filename_base = "thchs_pinyinzh_%sk_tok_%s" % (self.approx_vocab_size,
                                                       tag)
        data_path = self.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example 20
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        source_dataset = ['en_corpus']
        target_dataset = ['ch_corpus']
        source_vocab = get_or_generate_vocab(data_dir, self.source_vocab_name,
                                             self.approx_vocab_size,
                                             source_dataset)

        target_vocab = get_or_generate_vocab(data_dir, self.target_vocab_name,
                                             self.approx_vocab_size,
                                             target_dataset)

        tag = "train" if train else "dev"
        filename_base = "challenger_enzh_%sk_tok_%s" % (self.approx_vocab_size,
                                                        tag)

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(
                os.path.join(data_dir, 'en_corpus'),
                os.path.join(data_dir, 'ch_corpus')), source_vocab,
            target_vocab)
Example 21
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
     encoder = self.get_or_create_vocab(data_dir, tmp_dir)
     return text_problems.text2text_generate_encoded(
         generator, encoder, has_inputs=self.has_inputs)
Example 22
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
   generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
   encoder = self.get_or_create_vocab(data_dir, tmp_dir)
   return text_problems.text2text_generate_encoded(generator, encoder,
                                                   has_inputs=self.has_inputs)
Example 23
                os.path.join(data_dir, 'ch_corpus')), source_vocab,
            target_vocab)

    def feature_encoders(self, data_dir):
        source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
        target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
        source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
        target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
        return {
            "inputs": source_token,
            "targets": target_token,
        }


if __name__ == '__main__':
    root_path = '/home/zhangpengpeng/PycharmProjects/challenger_mt/mt/data1'
    source_vocab = text_encoder.TokenTextEncoder(os.path.join(
        root_path, 'en_vocab'),
                                                 num_reserved_ids=2,
                                                 replace_oov=UNK)
    target_vocab = text_encoder.TokenTextEncoder(os.path.join(
        root_path, 'ch_vocab'),
                                                 num_reserved_ids=2,
                                                 replace_oov=UNK)
    encoded = text_problems.text2text_generate_encoded(
        text_problems.text2text_txt_iterator(
            os.path.join(root_path, 'en_corpus'),
            os.path.join(root_path, 'ch_corpus')), source_vocab, target_vocab)
    for _ in range(100):
        print(next(encoded))
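As a follow-up to the block above, the encoded generator is normally written to disk with tensor2tensor's generator_utils rather than printed; a sketch with hypothetical shard names (the output paths are illustrative):

from tensor2tensor.data_generators import generator_utils

# Hypothetical output shards; generate_files serializes each yielded dict of
# integer lists into tf.Example protos, spreading them across the shards.
out_files = [os.path.join(root_path, "translate-train-%05d-of-00002" % i)
             for i in range(2)]
generator_utils.generate_files(encoded, out_files)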