def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate language-model samples from the downloaded wiki dump."""
  # Thresholds in the number of characters for LM examples.
  lo_thresh = 10
  up_thresh = 256 * 8
  if dataset_split == problem.DatasetSplit.TRAIN:
    (fname, fid) = self.train_name_id
  else:
    (fname, fid) = self.dev_name_id

  wikifiles = []
  url = "https://drive.google.com/uc?export=download&id=" + fid
  download_path = generator_utils.maybe_download_from_drive(tmp_dir, fname, url)
  wiki_file = os.path.join(tmp_dir, fname[:-3])  # Drop the ".gz" suffix.
  if not tf.gfile.Exists(wiki_file):
    generator_utils.gunzip_file(download_path, wiki_file)
  wikifiles.append(wiki_file)

  txt = ""
  for wiki_file in wikifiles:
    for line in tf.gfile.Open(wiki_file):
      line = line.strip()
      # Once the buffer would overflow, emit it (if within the thresholds)
      # and start a new one.
      if len(txt) + len(line) > up_thresh:
        ret = txt
        txt = ""
        if len(ret) > lo_thresh and len(ret) < up_thresh:
          yield {"targets": ret}
      if not txt:
        txt = line
      else:
        txt = " ".join([txt, line])
  # Emit whatever remains in the buffer at end of input; the original
  # version silently dropped this final chunk.
  if len(txt) > lo_thresh and len(txt) < up_thresh:
    yield {"targets": txt}

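# The train_name_id / dev_name_id properties are not part of this excerpt.
# They are assumed to return a (gzipped filename, Google Drive file id)
# pair; a hypothetical sketch of their shape (both values below are
# placeholders, not real Drive entries):
@property
def train_name_id(self):
  return ("enwiki_train.txt.gz", "PLACEHOLDER_DRIVE_FILE_ID")
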
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_lines, lang2_lines = [], []
  for dataset in datasets:
    url = dataset[0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    lang1_filename, lang2_filename = dataset[1]
    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
    lang2_filepath = os.path.join(tmp_dir, lang2_filename)

    if not os.path.exists(compressed_filepath):
      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
      # For .tar.gz and .tgz files, we read compressed.
      mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
      with tarfile.open(compressed_filepath, mode) as corpus_tar:
        corpus_tar.extractall(tmp_dir)
    # Some extracted members are themselves gzipped; unpack them too.
    if lang1_filepath.endswith(".gz"):
      new_filepath = lang1_filepath[:-len(".gz")]
      generator_utils.gunzip_file(lang1_filepath, new_filepath)
      lang1_filepath = new_filepath
    if lang2_filepath.endswith(".gz"):
      new_filepath = lang2_filepath[:-len(".gz")]
      generator_utils.gunzip_file(lang2_filepath, new_filepath)
      lang2_filepath = new_filepath

    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
        lang1_file_lines = lang1_file.readlines()
        lang2_file_lines = lang2_file.readlines()
        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
        lang1_lines.extend(lang1_file_lines)
        lang2_lines.extend(lang2_file_lines)

  write_chunk_size = 10000
  assert len(lang1_lines) == len(lang2_lines)
  # Write in fixed-size chunks; the original while-loop iterated once per
  # line count rather than per chunk, spinning through many empty slices.
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
    for i in range(0, len(lang1_lines), write_chunk_size):
      for line in lang1_lines[i:i + write_chunk_size]:
        lang1_file.write(line)
  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
    for i in range(0, len(lang2_lines), write_chunk_size):
      for line in lang2_lines[i:i + write_chunk_size]:
        lang2_file.write(line)
  return filename

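# A note on the recurring ".gz" handling: several of these variants
# originally called path.strip(".gz"), but str.strip treats its argument
# as a character set and removes those characters from both ends, not as
# a suffix. The rewrites here slice the suffix off instead:
assert "giza.gz".strip(".gz") == "iza"    # leading "g" and trailing ".gz" lost
assert "giza.gz"[:-len(".gz")] == "giza"  # slicing removes only the suffix
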
def _maybe_download_corpus(tmp_dir, vocab_type, dataset_url, dir_name):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary we are using (currently unused; kept for
      interface compatibility).
    dataset_url: URL to download the corpus archive from.
    dir_name: name of the subdirectory to unpack into.

  Returns:
    The path of the training file.
  """
  del vocab_type  # Unused.
  fname = os.path.basename(dataset_url)
  compressed_filepath = generator_utils.maybe_download(tmp_dir, fname,
                                                       dataset_url)
  unpacked_dir = os.path.join(tmp_dir, dir_name)
  if not tf.gfile.Exists(unpacked_dir):
    tf.gfile.MakeDirs(unpacked_dir)
  unpacked_file = os.path.join(unpacked_dir, os.path.splitext(fname)[0])
  generator_utils.gunzip_file(compressed_filepath, unpacked_file)

  # Convert the JSON-lines reviews into one plain-text review per line.
  txt = os.path.splitext(unpacked_file)[0] + ".txt"
  if not tf.gfile.Exists(txt):
    with open(unpacked_file, "rb") as jf, open(txt, "w") as wf:
      for line in jf:
        wf.write(json.loads(line)["reviewText"] + "\n")

  files = os.path.join(tmp_dir, dir_name, "*.txt")
  train_file = None
  for f in tf.gfile.Glob(files):
    train_file = f
  assert train_file, "Training file not found"
  return train_file

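# The JSON-to-text conversion above assumes Amazon-reviews-style input:
# one JSON object per line with a "reviewText" field. An illustrative
# input line (hypothetical content):
_SAMPLE_JSON_LINE = '{"reviewerID": "A1", "overall": 5.0, "reviewText": "Great product."}'
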
def prepare_data(data_dir, tmp_dir, sources, out_filename="train.tok",
                 use_jieba=True):
  """Preprocess dataset: download, unarchive and preprocess.

  Skips processing if the file exists.
  Writes to e.g. /data/t2t_datagen/train.tok.en
  """
  for source in sources:
    url = source[0]
    filename = os.path.basename(url)
    compressed_file = maybe_download(tmp_dir, filename, url)
    for lang_file in source[1]:
      # Pre-processed dataset path, e.g. train.tok.en.
      lang = get_lang(lang_file)
      _pp = "%s.%s" % (out_filename, lang)
      tf.logging.info("Reading file: %s, preprocessing to target file: %s"
                      % (lang_file, _pp))
      pp_filepath = os.path.join(data_dir, _pp)

      # Unarchive.
      filepath = os.path.join(tmp_dir, lang_file)
      if not tf.gfile.Exists(filepath):
        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
          corpus_tar.extractall(tmp_dir)

      # For some datasets a second extraction is necessary.
      if lang_file.endswith(".gz"):
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        if tf.gfile.Exists(new_filepath):
          tf.logging.info("Subdirectory %s already exists, skipping unpacking"
                          % filepath)
        else:
          tf.logging.info("Unpacking subdirectory %s" % filepath)
          gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Read and clean each line, and append to the target file.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        with tf.gfile.GFile(pp_filepath, mode="a") as out_file:
          is_zh = lang == "zh" and use_jieba
          for line in source_file:
            line = _preprocess(line.strip(), is_zh)
            out_file.write(line + "\n")

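# get_lang and _preprocess are assumed helpers that are not part of this
# excerpt. A minimal sketch of the shape prepare_data relies on; the
# names and the jieba-based segmentation are assumptions, not the actual
# implementation:
import os
import jieba  # Third-party Chinese word segmenter; assumed dependency.

def get_lang(path):
  # Assumed convention: the language code is the final extension,
  # e.g. "train.tok.en" -> "en".
  return os.path.splitext(path)[1][1:]

def _preprocess(line, is_zh):
  # Placeholder cleaning step; jieba segmentation for Chinese when requested.
  return " ".join(jieba.cut(line)) if is_zh else line
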
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        lang1_filename, lang2_filename = dataset[1]
        lang1_filepath = os.path.join(tmp_dir, lang1_filename)
        lang2_filepath = os.path.join(tmp_dir, lang2_filename)
        is_sgm = (lang1_filename.endswith("sgm") and
                  lang2_filename.endswith("sgm"))

        generator_utils.maybe_download(tmp_dir, compressed_filename, url)
        if not (os.path.exists(lang1_filepath) and
                os.path.exists(lang2_filepath)):
          # For .tar.gz and .tgz files, we read compressed.
          mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
          with tarfile.open(compressed_filepath, mode) as corpus_tar:
            corpus_tar.extractall(tmp_dir)
        if lang1_filepath.endswith(".gz"):
          new_filepath = lang1_filepath[:-len(".gz")]
          generator_utils.gunzip_file(lang1_filepath, new_filepath)
          lang1_filepath = new_filepath
        if lang2_filepath.endswith(".gz"):
          new_filepath = lang2_filepath[:-len(".gz")]
          generator_utils.gunzip_file(lang2_filepath, new_filepath)
          lang2_filepath = new_filepath

        with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
          with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
            line1, line2 = lang1_file.readline(), lang2_file.readline()
            while line1 or line2:
              line1res = _preprocess_sgm(line1, is_sgm)
              line2res = _preprocess_sgm(line2, is_sgm)
              if line1res or line2res:
                lang1_resfile.write(line1res.strip() + "\n")
                lang2_resfile.write(line2res.strip() + "\n")
              line1, line2 = lang1_file.readline(), lang2_file.readline()
  return filename

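# _preprocess_sgm is not shown in this excerpt. A minimal sketch of the
# behavior the loop above relies on (keep only the text inside <seg> tags
# of WMT-style .sgm files, pass plain text through unchanged); treat this
# as an assumption rather than the library implementation:
def _preprocess_sgm(line, is_sgm):
  if not is_sgm:
    return line
  line = line.strip()
  if line.startswith("<seg") and line.endswith("</seg>"):
    # Drop the opening tag up to its closing ">" and the trailing </seg>.
    return line[line.index(">") + 1:-len("</seg>")]
  return ""  # Markup-only lines contribute nothing.
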
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        lang1_filename, lang2_filename = dataset[1]
        lang1_filepath = os.path.join(tmp_dir, lang1_filename)
        lang2_filepath = os.path.join(tmp_dir, lang2_filename)

        if not os.path.exists(compressed_filepath):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)
        if not (os.path.exists(lang1_filepath) and
                os.path.exists(lang2_filepath)):
          # For .tar.gz and .tgz files, we read compressed.
          mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
          with tarfile.open(compressed_filepath, mode) as corpus_tar:
            corpus_tar.extractall(tmp_dir)
        if lang1_filepath.endswith(".gz"):
          new_filepath = lang1_filepath[:-len(".gz")]
          generator_utils.gunzip_file(lang1_filepath, new_filepath)
          lang1_filepath = new_filepath
        if lang2_filepath.endswith(".gz"):
          new_filepath = lang2_filepath[:-len(".gz")]
          generator_utils.gunzip_file(lang2_filepath, new_filepath)
          lang2_filepath = new_filepath

        with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
          with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
            line1, line2 = lang1_file.readline(), lang2_file.readline()
            while line1 or line2:
              lang1_resfile.write(line1.strip() + "\n")
              lang2_resfile.write(line2.strip() + "\n")
              line1, line2 = lang1_file.readline(), lang2_file.readline()
  return filename

def testGunzipFile(self):
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)

  # Create a test gzip file and unzip it.
  with gzip.open(tmp_file_path + ".gz", "wb") as gz_file:
    gz_file.write(bytes("test line", "utf-8"))
  generator_utils.gunzip_file(tmp_file_path + ".gz", tmp_file_path + ".txt")

  # Check that the unzipped result is as expected.
  lines = []
  for line in io.open(tmp_file_path + ".txt", "rb"):
    lines.append(line.decode("utf-8").strip())
  self.assertEqual(len(lines), 1)
  self.assertEqual(lines[0], "test line")

  # Clean up.
  os.remove(tmp_file_path + ".gz")
  os.remove(tmp_file_path + ".txt")
  os.remove(tmp_file_path)

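# For readers without the tensor2tensor source at hand, a minimal sketch
# of what generator_utils.gunzip_file is expected to do (an illustrative
# re-implementation, not the library code):
import gzip
import shutil

def gunzip_file_sketch(gz_path, new_path):
  # Stream the decompressed bytes to keep memory use bounded.
  with gzip.open(gz_path, "rb") as gz_file:
    with open(new_path, "wb") as new_file:
      shutil.copyfileobj(gz_file, new_file)
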
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate samples."""
  if dataset_split == problem.DatasetSplit.TRAIN:
    file_names_ids = self.train_names_ids
  elif dataset_split == problem.DatasetSplit.TEST:
    file_names_ids = self.test_names_ids
  else:
    file_names_ids = self.eval_names_ids

  wiki_generators = []
  for (fname, fid) in file_names_ids:
    url = "https://drive.google.com/uc?export=download&id=" + fid
    download_path = generator_utils.maybe_download_from_drive(
        tmp_dir, fname, url)
    wiki_file = os.path.join(tmp_dir, fname[:-3])
    if not tf.gfile.Exists(wiki_file):
      generator_utils.gunzip_file(download_path, wiki_file)
    wiki_generators.append(
        concat_generator(wiki_file, self.combine_characters_threshold))

  for example in mix_generators(wiki_generators):
    yield example

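# concat_generator and mix_generators are assumed helpers mirroring the
# character-buffering logic of the first generate_samples above; a minimal
# sketch under that assumption:
def concat_generator(filename, up_threshold, low_threshold=10):
  # Accumulate stripped lines until the buffer would exceed up_threshold,
  # then emit it as a {"targets": ...} sample if it is within bounds.
  txt = ""
  for line in tf.gfile.Open(filename):
    line = line.strip()
    if len(txt) + len(line) + 1 >= up_threshold:
      ret = txt
      txt = ""
      if low_threshold < len(ret) < up_threshold:
        yield {"targets": ret}
    txt = line if not txt else " ".join([txt, line])

def mix_generators(generator_list):
  # Round-robin over the generators until every one is exhausted.
  i, exhausted = 0, [False] * len(generator_list)
  while not all(exhausted):
    idx = i % len(generator_list)
    if not exhausted[idx]:
      try:
        yield next(generator_list[idx])
      except StopIteration:
        exhausted[idx] = True
    i += 1
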
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  lang1_resfile.write(source.strip() + "\n")
                  lang2_resfile.write(target.strip() + "\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))
          if not (os.path.exists(lang1_filepath) and
                  os.path.exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
            with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
              line1, line2 = lang1_file.readline(), lang2_file.readline()
              while line1 or line2:
                line1res = _preprocess_sgm(line1, is_sgm)
                line2res = _preprocess_sgm(line2, is_sgm)
                if line1res or line2res:
                  lang1_resfile.write(line1res.strip() + "\n")
                  lang2_resfile.write(line2res.strip() + "\n")
                line1, line2 = lang1_file.readline(), lang2_file.readline()
  return filename

def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
  """Concatenates all `datasets` and saves to `filename`."""
  datatypes_to_clean = datatypes_to_clean or []
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)
        if compressed_filename.endswith(".zip"):
          zipfile.ZipFile(compressed_filepath, "r").extractall(tmp_dir)

        if dataset[1][0] == "tmx":
          cleaning_requested = "tmx" in datatypes_to_clean
          tmx_filename = os.path.join(tmp_dir, dataset[1][1])
          if tmx_filename.endswith(".gz"):
            with gzip.open(tmx_filename, "rb") as tmx_file:
              _tmx_to_source_target(tmx_file, lang1_resfile, lang2_resfile,
                                    do_cleaning=cleaning_requested)
          else:
            with tf.gfile.Open(tmx_filename) as tmx_file:
              _tmx_to_source_target(tmx_file, lang1_resfile, lang2_resfile,
                                    do_cleaning=cleaning_requested)

        elif dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  clean_pairs = [(source, target)]
                  if "tsv" in datatypes_to_clean:
                    clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
                  for source, target in clean_pairs:
                    if source and target:
                      lang1_resfile.write(source)
                      lang1_resfile.write("\n")
                      lang2_resfile.write(target)
                      lang2_resfile.write("\n")

        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))
          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            clean_pairs = [(line1res, line2res)]
            if "txt" in datatypes_to_clean:
              clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
            for line1res, line2res in clean_pairs:
              if line1res and line2res:
                lang1_resfile.write(line1res)
                lang1_resfile.write("\n")
                lang2_resfile.write(line2res)
                lang2_resfile.write("\n")
  return filename

def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tmx":
          tmx_filename = os.path.join(tmp_dir, dataset[1][1])
          if tmx_filename.endswith(".gz"):
            new_filename = tmx_filename[:-len(".gz")]
            if not tf.gfile.Exists(new_filename):
              generator_utils.gunzip_file(tmx_filename, new_filename)
            tmx_filename = new_filename
          # <seg> lines alternate source/target within each translation
          # unit; emit a pair once both sides have been seen.
          source, target = None, None
          with tf.gfile.Open(tmx_filename) as tmx_file:
            for line in tmx_file:
              text = line.strip()
              if text.startswith("<seg>") and text.endswith("</seg>"):
                sentence = text[5:-6]  # Strip <seg> and </seg>.
                if source is None:
                  source = sentence
                else:
                  target = sentence
              if source is not None and target is not None:
                if source and target:  # Prevent empty string examples.
                  lang1_resfile.write(source)
                  lang1_resfile.write("\n")
                  lang2_resfile.write(target)
                  lang2_resfile.write("\n")
                source, target = None, None

        elif dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")

        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))
          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")
  return filename

def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))
          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")
  return filename

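# For orientation, the `datasets` argument consumed by these compile_data
# variants is a list of (url, spec) tuples. A hypothetical value covering
# both branch formats (the URLs and member paths are placeholders, not
# real datasets):
_ILLUSTRATIVE_DATASETS = [
    # Plain parallel text: (url, (lang1_member, lang2_member)); members
    # may end in .sgm or .gz and are handled accordingly.
    ("http://example.com/parallel-corpus.tgz",
     ("corpus/train.en", "corpus/train.de")),
    # Tab-separated pairs: (url, ("tsv", source_column, target_column, glob)).
    ("http://example.com/pairs.tar.gz",
     ("tsv", 0, 1, "pairs/*.tsv")),
]
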
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        # Each dataset comes with one download URL.
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          # Unclear what this format is; it is not used in our case, so
          # this branch can be skipped.
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:
          # This is the branch we actually use.
          # Get the two languages' filenames and join them onto the path.
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          # ??? What does it mean for both files to end in sgm?
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))
          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # Unpack the downloaded archive.
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            # If an extracted file is itself compressed, unpack it again.
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            # If an extracted file is itself compressed, unpack it again.
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
          # Each example is a dict with two entries:
          # {"inputs": inputs, "targets": targets}.
          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            # The next few lines strip the auxiliary markup and keep only
            # the text.
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")
  return filename