def _tmx_to_source_target(tmx_file, source_resfile, target_resfile, do_cleaning=False): source_target_pairs = cleaner_en_xx.paracrawl_v3_pairs(tmx_file) if do_cleaning: source_target_pairs = cleaner_en_xx.clean_en_xx_pairs(source_target_pairs) for source, target in source_target_pairs: source_resfile.write(source) source_resfile.write("\n") target_resfile.write(target) target_resfile.write("\n")
def compile_data(self, tmp_dir, datasets, filename, datatypes_to_clean=None): """Concatenates all `datasets` and saves to `filename`.""" datatypes_to_clean = datatypes_to_clean or [] filename = os.path.join(tmp_dir, filename) lang1_fname = filename + ".lang1" lang2_fname = filename + ".lang2" if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname): tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname, lang2_fname) return filename with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile: with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile: for lang_filename in datasets: lang_filepath = os.path.join(tmp_dir, lang_filename) is_sgm = (lang_filename.endswith("sgm")) for example in self.text2text_txt_tab_iterator( lang_filepath): line1res = translate._preprocess_sgm( example["inputs"], is_sgm) line2res = translate._preprocess_sgm( example["targets"], is_sgm) clean_pairs = [(line1res, line2res)] if "txt" in datatypes_to_clean: clean_pairs = cleaner_en_xx.clean_en_xx_pairs( clean_pairs) for line1res, line2res in clean_pairs: if line1res and line2res: lang1_resfile.write(line1res) lang1_resfile.write("\n") lang2_resfile.write(line2res) lang2_resfile.write("\n") return filename
def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None): """Concatenates all `datasets` and saves to `filename`.""" datatypes_to_clean = datatypes_to_clean or [] filename = os.path.join(tmp_dir, filename) lang1_fname = filename + ".lang1" lang2_fname = filename + ".lang2" if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname): tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname, lang2_fname) return filename with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile: with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile: for dataset in datasets: url = dataset[0] compressed_filename = os.path.basename(url) compressed_filepath = os.path.join(tmp_dir, compressed_filename) if url.startswith("http"): generator_utils.maybe_download(tmp_dir, compressed_filename, url) if compressed_filename.endswith(".zip"): zipfile.ZipFile(os.path.join(compressed_filepath), "r").extractall(tmp_dir) if dataset[1][0] == "tmx": cleaning_requested = "tmx" in datatypes_to_clean tmx_filename = os.path.join(tmp_dir, dataset[1][1]) if tmx_filename.endswith(".gz"): with gzip.open(tmx_filename, "rb") as tmx_file: _tmx_to_source_target( tmx_file, lang1_resfile, lang2_resfile, do_cleaning=cleaning_requested) else: with tf.gfile.Open(tmx_filename) as tmx_file: _tmx_to_source_target( tmx_file, lang1_resfile, lang2_resfile, do_cleaning=cleaning_requested) elif dataset[1][0] == "tsv": _, src_column, trg_column, glob_pattern = dataset[1] filenames = tf.gfile.Glob( os.path.join(tmp_dir, glob_pattern)) if not filenames: # Capture *.tgz and *.tar.gz too. mode = "r:gz" if compressed_filepath.endswith( "gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: corpus_tar.extractall(tmp_dir) filenames = tf.gfile.Glob( os.path.join(tmp_dir, glob_pattern)) for tsv_filename in filenames: if tsv_filename.endswith(".gz"): new_filename = tsv_filename.strip(".gz") generator_utils.gunzip_file( tsv_filename, new_filename) tsv_filename = new_filename with tf.gfile.Open(tsv_filename) as tsv_file: for line in tsv_file: if line and "\t" in line: parts = line.split("\t") source, target = parts[src_column], parts[ trg_column] source, target = source.strip( ), target.strip() clean_pairs = [(source, target)] if "tsv" in datatypes_to_clean: clean_pairs = cleaner_en_xx.clean_en_xx_pairs( clean_pairs) for source, target in clean_pairs: if source and target: lang1_resfile.write(source) lang1_resfile.write("\n") lang2_resfile.write(target) lang2_resfile.write("\n") else: lang1_filename, lang2_filename = dataset[1] lang1_filepath = os.path.join(tmp_dir, lang1_filename) lang2_filepath = os.path.join(tmp_dir, lang2_filename) is_sgm = (lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm")) if not (tf.gfile.Exists(lang1_filepath) and tf.gfile.Exists(lang2_filepath)): # For .tar.gz and .tgz files, we read compressed. mode = "r:gz" if compressed_filepath.endswith( "gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: corpus_tar.extractall(tmp_dir) if lang1_filepath.endswith(".gz"): new_filepath = lang1_filepath.strip(".gz") generator_utils.gunzip_file(lang1_filepath, new_filepath) lang1_filepath = new_filepath if lang2_filepath.endswith(".gz"): new_filepath = lang2_filepath.strip(".gz") generator_utils.gunzip_file(lang2_filepath, new_filepath) lang2_filepath = new_filepath for example in text_problems.text2text_txt_iterator( lang1_filepath, lang2_filepath): line1res = _preprocess_sgm(example["inputs"], is_sgm) line2res = _preprocess_sgm(example["targets"], is_sgm) clean_pairs = [(line1res, line2res)] if "txt" in datatypes_to_clean: clean_pairs = cleaner_en_xx.clean_en_xx_pairs( clean_pairs) for line1res, line2res in clean_pairs: if line1res and line2res: lang1_resfile.write(line1res) lang1_resfile.write("\n") lang2_resfile.write(line2res) lang2_resfile.write("\n") return filename
def _clean_sentences(sentence_pairs): res_pairs = [] for cleaned in cleaner_en_xx.clean_en_xx_pairs(sentence_pairs): res_pairs.append(cleaned) return res_pairs