def generate_vocab(self, data_dir, tmp_dir, **kwargs):
        datasets = get_dataset(tmp_dir)
        source_datasets = [[item[0], [item[1][0]]] for item in datasets]
        target_datasets = [[item[0], [item[1][1]]] for item in datasets]
        tf.gfile.MkDir(data_dir)
        for each in source_datasets:
            print("src_file:{file:s}".format(file=str(each)))

        for each in target_datasets:
            print("target_file:{file:s}".format(file=str(each)))

        source_vocab_generator = \
            generator_utils.generate_lines_for_vocab(tmp_dir, source_datasets, file_byte_budget=1e10)
        target_vocab_generator = \
            generator_utils.generate_lines_for_vocab(tmp_dir, target_datasets, file_byte_budget=1e10)

        if tf.gfile.Exists(
                os.path.join(
                    tmp_dir, "{prefix:s}.corpus.txt".format(
                        prefix=self.source_vocab_name))):
            print("====Source/Target File Exists====")
        else:
            count = 0
            with tf.gfile.Open(
                    os.path.join(
                        tmp_dir, "{prefix:s}.corpus.txt".format(
                            prefix=self.source_vocab_name)), "w") as f:
                for src, tgt in zip(source_vocab_generator,
                                    target_vocab_generator):
                    f.write(src)
                    f.write("\n")
                    f.write(tgt)
                    f.write("\n")
                    count += 1
                print("====src/tgt:{src:d}=====".format(src=count))

        _ = SpmTextEncoder.build_from_file(
            output_dir=data_dir,
            filename=os.path.join(
                tmp_dir,
                "{prefix:s}.corpus.txt".format(prefix=self.source_vocab_name)),
            vocab_size=self.approx_vocab_size,
            model_prefix=self.source_vocab_name,
            reserved_tokens=kwargs['reserved_tokens'],
            model_type=kwargs["model_type"],
            sentence_size=40000000)

        model_src_path = os.path.join(data_dir,
                                      self.source_vocab_name + '.model')
        vocab_src_path = os.path.join(data_dir,
                                      self.source_vocab_name + '.vocab')

        model_tgt_path = os.path.join(data_dir,
                                      self.target_vocab_name + '.model')
        vocab_tgt_path = os.path.join(data_dir,
                                      self.target_vocab_name + '.vocab')

        shutil.copy(model_src_path, model_tgt_path)
        shutil.copy(vocab_src_path, vocab_tgt_path)
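Example 1 builds a single shared corpus by interleaving source and target lines, trains one SentencePiece model from it, and then copies the resulting .model/.vocab files to the target vocabulary names so both languages share the same subword inventory. Below is a minimal sketch of the SentencePiece training step that SpmTextEncoder.build_from_file presumably wraps; the paths, prefix, and sizes are placeholders, and only the sentencepiece call itself is standard API.

import sentencepiece as spm

# Sketch of the underlying SentencePiece training call (assumption: this is
# roughly what SpmTextEncoder.build_from_file does). Paths and sizes are
# illustrative placeholders, not values taken from the snippet above.
def train_joint_spm(corpus_path, model_prefix, vocab_size=32000, model_type="bpe"):
    spm.SentencePieceTrainer.Train(
        "--input={input} --model_prefix={prefix} "
        "--vocab_size={size} --model_type={mtype}".format(
            input=corpus_path, prefix=model_prefix,
            size=vocab_size, mtype=model_type))

# train_joint_spm("/tmp/t2t_tmp/vocab.src.corpus.txt", "/tmp/t2t_data/vocab.src")
# would produce vocab.src.model and vocab.src.vocab; the snippet above then
# copies them to the target vocabulary names with shutil.copy.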
Example 2
def get_or_generate_spm(data_dir,
                        tmp_dir,
                        vocab_size,
                        model_prefix,
                        sources,
                        file_byte_budget=1e6,
                        model_type="bpe",
                        reserved_tokens=None):
    """Generate a vocabulary from the datasets in sources."""
    vocab_generator = generate_lines_for_vocab(tmp_dir, sources, file_byte_budget)
    return get_or_generate_spm_inner(tmp_dir,
                                     data_dir,
                                     vocab_size,
                                     model_prefix,
                                     vocab_generator,
                                     model_type,
                                     reserved_tokens)
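A hedged usage sketch of get_or_generate_spm follows. The sources structure mirrors the [[archive, [filenames]]] convention visible in the other examples; the URL, directories, and prefix are hypothetical, and get_or_generate_spm_inner is assumed to return the trained encoder.

# Hypothetical call site; every literal below is a placeholder.
sources = [
    ["http://example.com/parallel-corpus.tgz",   # archive with the training text
     ["corpus.en", "corpus.de"]],                # files inside it to read lines from
]
encoder = get_or_generate_spm(
    data_dir="/tmp/t2t_data",
    tmp_dir="/tmp/t2t_tmp",
    vocab_size=32000,
    model_prefix="vocab.ende",
    sources=sources,
    model_type="unigram",
    reserved_tokens=["<pad>", "<EOS>"])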
Example 3
def generate_text_for_vocab(self, data_dir, tmp_dir):
    return generator_utils.generate_lines_for_vocab(
        tmp_dir, self.vocab_data_files())
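Examples 3 and 4 simply delegate to generator_utils.generate_lines_for_vocab, which yields plain text lines from the problem's vocab data files up to a per-file byte budget. The sketch below captures that essential behaviour under the assumption that the files are already present locally; the real tensor2tensor helper also downloads and unpacks the source archives and samples lines rather than simply truncating.

import os

# Simplified stand-in for generate_lines_for_vocab (assumption: budgeted line
# iteration is the core behaviour). sources follows the [[origin, [filenames]]]
# layout used in the examples above.
def iter_lines_for_vocab(tmp_dir, sources, file_byte_budget=1e6):
    for _, filenames in sources:
        for filename in filenames:
            filepath = os.path.join(tmp_dir, filename)
            remaining = file_byte_budget
            with open(filepath, encoding="utf-8") as f:
                for line in f:
                    if remaining <= 0:
                        break
                    line = line.strip()
                    remaining -= len(line)
                    yield line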
Example 4
def generate_text_for_vocab(self, data_dir, tmp_dir):
    return generator_utils.generate_lines_for_vocab(tmp_dir,
                                                    self.vocab_data_files())
Example 5

def generate_vocab(self, data_dir, tmp_dir, **kwargs):
        datasets = get_dataset(tmp_dir)
        source_datasets = [[item[0], [item[1][0]]] for item in datasets]
        target_datasets = [[item[0], [item[1][1]]] for item in datasets]
        tf.gfile.MkDir(data_dir)
        for each in source_datasets:
            print("src_file:{file:s}".format(file=str(each)))

        for each in target_datasets:
            print("target_file:{file:s}".format(file=str(each)))

        source_vocab_generator = \
            generator_utils.generate_lines_for_vocab(tmp_dir, source_datasets, file_byte_budget=1e10)
        target_vocab_generator = \
            generator_utils.generate_lines_for_vocab(tmp_dir, target_datasets, file_byte_budget=1e10)

        if tf.gfile.Exists(
                os.path.join(
                    tmp_dir, "{prefix:s}.corpus.txt".format(
                        prefix=self.source_vocab_name))):
            print("====Source File Exists====")
        else:
            count = 0
            with tf.gfile.Open(
                    os.path.join(
                        tmp_dir, "{prefix:s}.corpus.txt".format(
                            prefix=self.source_vocab_name)), "w") as f:
                for line in source_vocab_generator:
                    f.write(line)
                    f.write("\n")
                    count += 1
                print("====src:{src:d}=====".format(src=count))

        if tf.gfile.Exists(
                os.path.join(
                    tmp_dir, "{prefix:s}.corpus.txt".format(
                        prefix=self.target_vocab_name))):
            print("====Target File Exists====")
        else:
            count = 0
            with tf.gfile.Open(
                    os.path.join(
                        tmp_dir, "{prefix:s}.corpus.txt".format(
                            prefix=self.target_vocab_name)), "w") as f:
                for line in target_vocab_generator:
                    f.write(line)
                    f.write("\n")
                    count += 1
                print("====target:{target:d}=====".format(target=count))

        _ = SpmTextEncoder.build_from_file(
            output_dir=data_dir,
            filename=os.path.join(
                tmp_dir,
                "{prefix:s}.corpus.txt".format(prefix=self.source_vocab_name)),
            vocab_size=self.approx_vocab_size,
            model_prefix=self.source_vocab_name,
            reserved_tokens=kwargs['reserved_tokens'],
            model_type=kwargs["model_type"])

        _ = SpmTextEncoder.build_from_file(
            output_dir=data_dir,
            filename=os.path.join(
                tmp_dir,
                "{prefix:s}.corpus.txt".format(prefix=self.target_vocab_name)),
            vocab_size=int(self.approx_vocab_size / 2),
            model_prefix=self.target_vocab_name,
            reserved_tokens=kwargs['reserved_tokens'],
            model_type=kwargs["model_type"])
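Unlike Example 1, this variant writes separate source and target corpus files and trains two independent SentencePiece models, giving the target side roughly half the source vocabulary size. Whichever variant is used, the produced .model file can be loaded directly with the sentencepiece runtime; the path below is a placeholder for wherever source_vocab_name resolves.

import sentencepiece as spm

# Load the trained model and round-trip a sentence through it.
# The model path is a hypothetical example, not taken from the code above.
sp = spm.SentencePieceProcessor()
sp.Load("/tmp/t2t_data/vocab.src.model")

ids = sp.EncodeAsIds("a sentence to tokenize")
print(ids)                # subword ids
print(sp.DecodeIds(ids))  # decodes back to (approximately) the original text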