import os
import shutil

import tensorflow as tf

# generator_utils, SpmTextEncoder and get_dataset are project-level imports
# defined elsewhere in this repository.


def generate_vocab(self, data_dir, tmp_dir, **kwargs):
  """Builds a single SentencePiece vocab shared by source and target."""
  datasets = get_dataset(tmp_dir)
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  tf.gfile.MkDir(data_dir)
  for each in source_datasets:
    print("src_file:{file:s}".format(file=str(each)))
  for each in target_datasets:
    print("target_file:{file:s}".format(file=str(each)))
  source_vocab_generator = generator_utils.generate_lines_for_vocab(
      tmp_dir, source_datasets, file_byte_budget=1e10)
  target_vocab_generator = generator_utils.generate_lines_for_vocab(
      tmp_dir, target_datasets, file_byte_budget=1e10)
  corpus_path = os.path.join(
      tmp_dir, "{prefix:s}.corpus.txt".format(prefix=self.source_vocab_name))
  if tf.gfile.Exists(corpus_path):
    print("====Source/Target File Exists====")
  else:
    # Interleave source and target lines into one corpus file; zip() stops
    # at the shorter of the two generators.
    count = 0
    with tf.gfile.Open(corpus_path, "w") as f:
      for src, tgt in zip(source_vocab_generator, target_vocab_generator):
        f.write(src)
        f.write("\n")
        f.write(tgt)
        f.write("\n")
        count += 1
    print("====src/tgt:{src:d}=====".format(src=count))
  _ = SpmTextEncoder.build_from_file(
      output_dir=data_dir,
      filename=corpus_path,
      vocab_size=self.approx_vocab_size,
      model_prefix=self.source_vocab_name,
      reserved_tokens=kwargs["reserved_tokens"],
      model_type=kwargs["model_type"],
      sentence_size=40000000)
  # The shared model is trained once under the source prefix, then copied so
  # the target side sees identical .model/.vocab files.
  model_src_path = os.path.join(data_dir, self.source_vocab_name + ".model")
  vocab_src_path = os.path.join(data_dir, self.source_vocab_name + ".vocab")
  model_tgt_path = os.path.join(data_dir, self.target_vocab_name + ".model")
  vocab_tgt_path = os.path.join(data_dir, self.target_vocab_name + ".vocab")
  shutil.copy(model_src_path, model_tgt_path)
  shutil.copy(vocab_src_path, vocab_tgt_path)
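
# Usage sketch: a minimal, hedged example of driving the method above. The
# problem object, directories, and kwarg values here are hypothetical
# placeholders, not part of this module; the method only requires that the
# object exposes source_vocab_name, target_vocab_name and approx_vocab_size.
def _example_shared_vocab(problem, data_dir="/tmp/t2t_data",
                          tmp_dir="/tmp/t2t_tmp"):
  # reserved_tokens and model_type are effectively mandatory kwargs:
  # generate_vocab indexes kwargs["reserved_tokens"] and
  # kwargs["model_type"] directly.
  problem.generate_vocab(data_dir, tmp_dir,
                         reserved_tokens=None,
                         model_type="bpe")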

def get_or_generate_spm(data_dir,
                        tmp_dir,
                        vocab_size,
                        model_prefix,
                        sources,
                        file_byte_budget=1e6,
                        model_type="bpe",
                        reserved_tokens=None):
  """Generate a vocabulary from the datasets in sources."""
  vocab_generator = generate_lines_for_vocab(tmp_dir, sources,
                                             file_byte_budget)
  return get_or_generate_spm_inner(tmp_dir, data_dir, vocab_size,
                                   model_prefix, vocab_generator,
                                   model_type, reserved_tokens)
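
# Usage sketch: "sources" follows the dataset convention used throughout
# this file, i.e. a list of [download_url, [filename, ...]] entries. The
# URL, filename, and prefix below are hypothetical placeholders.
def _example_get_or_generate_spm(data_dir, tmp_dir):
  sources = [["http://example.com/corpus.tgz", ["corpus.src"]]]
  return get_or_generate_spm(data_dir, tmp_dir,
                             vocab_size=32000,
                             model_prefix="vocab.example",
                             sources=sources,
                             model_type="bpe")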

def generate_text_for_vocab(self, data_dir, tmp_dir):
  return generator_utils.generate_lines_for_vocab(tmp_dir,
                                                  self.vocab_data_files())
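
# Note: vocab_data_files() is assumed to return entries in the same
# [download_url, [filename, ...]] format as "sources" above, typically
# restricted to the training split; the exact selection depends on the
# problem class.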

def generate_vocab(self, data_dir, tmp_dir, **kwargs):
  """Builds separate SentencePiece vocabs for source and target."""
  datasets = get_dataset(tmp_dir)
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  tf.gfile.MkDir(data_dir)
  for each in source_datasets:
    print("src_file:{file:s}".format(file=str(each)))
  for each in target_datasets:
    print("target_file:{file:s}".format(file=str(each)))
  source_vocab_generator = generator_utils.generate_lines_for_vocab(
      tmp_dir, source_datasets, file_byte_budget=1e10)
  target_vocab_generator = generator_utils.generate_lines_for_vocab(
      tmp_dir, target_datasets, file_byte_budget=1e10)
  source_corpus_path = os.path.join(
      tmp_dir, "{prefix:s}.corpus.txt".format(prefix=self.source_vocab_name))
  target_corpus_path = os.path.join(
      tmp_dir, "{prefix:s}.corpus.txt".format(prefix=self.target_vocab_name))
  if tf.gfile.Exists(source_corpus_path):
    print("====Source File Exists====")
  else:
    count = 0
    with tf.gfile.Open(source_corpus_path, "w") as f:
      for line in source_vocab_generator:
        f.write(line)
        f.write("\n")
        count += 1
    print("====src:{src:d}=====".format(src=count))
  if tf.gfile.Exists(target_corpus_path):
    print("====Target File Exists====")
  else:
    count = 0
    with tf.gfile.Open(target_corpus_path, "w") as f:
      for line in target_vocab_generator:
        f.write(line)
        f.write("\n")
        count += 1
    print("====target:{target:d}=====".format(target=count))
  _ = SpmTextEncoder.build_from_file(
      output_dir=data_dir,
      filename=source_corpus_path,
      vocab_size=self.approx_vocab_size,
      model_prefix=self.source_vocab_name,
      reserved_tokens=kwargs["reserved_tokens"],
      model_type=kwargs["model_type"])
  # The target vocab is intentionally built at half the source size.
  _ = SpmTextEncoder.build_from_file(
      output_dir=data_dir,
      filename=target_corpus_path,
      vocab_size=int(self.approx_vocab_size / 2),
      model_prefix=self.target_vocab_name,
      reserved_tokens=kwargs["reserved_tokens"],
      model_type=kwargs["model_type"])
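
# Sanity-check sketch: the .model/.vocab files written to data_dir by
# SpmTextEncoder.build_from_file appear to be standard SentencePiece
# artifacts (model_prefix + ".model"/".vocab"), so they can be inspected
# with the sentencepiece package directly. The path handling here is an
# assumption based on the copy logic above, not part of this module.
import sentencepiece as spm


def _inspect_spm_model(model_path, sample_sentence="a short test sentence"):
  sp = spm.SentencePieceProcessor()
  sp.Load(model_path)
  # Piece count should land near approx_vocab_size (or half of it for the
  # target-side model built above).
  print("vocab size:", sp.GetPieceSize())
  print("pieces:", sp.EncodeAsPieces(sample_sentence))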