Example #1
from tensor2tensor.data_generators import cleaner_en_xx


def _tmx_to_source_target(tmx_file, source_resfile, target_resfile,
                          do_cleaning=False):
  source_target_pairs = cleaner_en_xx.paracrawl_v3_pairs(tmx_file)
  if do_cleaning:
    source_target_pairs = cleaner_en_xx.clean_en_xx_pairs(source_target_pairs)
  for source, target in source_target_pairs:
    source_resfile.write(source)
    source_resfile.write("\n")
    target_resfile.write(target)
    target_resfile.write("\n")
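A minimal usage sketch, assuming the helper is driven the same way Example #3 below drives it (the file names here are made up):

import gzip

import tensorflow as tf

# Hypothetical call: stream a gzipped TMX corpus into two parallel
# one-sentence-per-line text files.
with gzip.open("en-de.tmx.gz", "rb") as tmx_file:
  with tf.gfile.GFile("corpus.lang1", mode="w") as source_resfile:
    with tf.gfile.GFile("corpus.lang2", mode="w") as target_resfile:
      _tmx_to_source_target(tmx_file, source_resfile, target_resfile,
                            do_cleaning=True)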
Example #2
    def compile_data(self,
                     tmp_dir,
                     datasets,
                     filename,
                     datatypes_to_clean=None):
        """Concatenates all `datasets` and saves to `filename`."""
        datatypes_to_clean = datatypes_to_clean or []
        filename = os.path.join(tmp_dir, filename)
        lang1_fname = filename + ".lang1"
        lang2_fname = filename + ".lang2"
        if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
            tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                            lang1_fname, lang2_fname)
            return filename
        with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
            with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
                for lang_filename in datasets:
                    lang_filepath = os.path.join(tmp_dir, lang_filename)
                    is_sgm = (lang_filename.endswith("sgm"))

                    for example in self.text2text_txt_tab_iterator(
                            lang_filepath):
                        line1res = translate._preprocess_sgm(
                            example["inputs"], is_sgm)
                        line2res = translate._preprocess_sgm(
                            example["targets"], is_sgm)
                        clean_pairs = [(line1res, line2res)]
                        if "txt" in datatypes_to_clean:
                            clean_pairs = cleaner_en_xx.clean_en_xx_pairs(
                                clean_pairs)
                        for line1res, line2res in clean_pairs:
                            if line1res and line2res:
                                lang1_resfile.write(line1res)
                                lang1_resfile.write("\n")
                                lang2_resfile.write(line2res)
                                lang2_resfile.write("\n")

        return filename
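A sketch of a plausible call site, assuming `problem` is an instance of the Problem subclass that defines this method and that the listed tab-separated files already sit in `tmp_dir` (all names are made up):

# Hypothetical usage: each `datasets` entry names a source<TAB>target
# file in `tmp_dir`.
train_path = problem.compile_data(
    tmp_dir="/tmp/t2t_datagen",
    datasets=["train.en-de.tsv", "dev.en-de.tsv"],
    filename="train_compiled",
    datatypes_to_clean=["txt"])
# Returns the shared prefix; the pairs land in train_compiled.lang1
# and train_compiled.lang2.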
Example #3
import gzip
import os
import tarfile
import zipfile

import tensorflow as tf

from tensor2tensor.data_generators import cleaner_en_xx
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import text_problems

# `_preprocess_sgm` and `_tmx_to_source_target` are helpers from the same
# module; Example #1 above shows `_tmx_to_source_target`.


def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
    """Concatenates all `datasets` and saves to `filename`."""
    datatypes_to_clean = datatypes_to_clean or []
    filename = os.path.join(tmp_dir, filename)
    lang1_fname = filename + ".lang1"
    lang2_fname = filename + ".lang2"
    if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
        tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                        lang1_fname, lang2_fname)
        return filename
    with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
        with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
            for dataset in datasets:
                url = dataset[0]
                compressed_filename = os.path.basename(url)
                compressed_filepath = os.path.join(tmp_dir,
                                                   compressed_filename)
                if url.startswith("http"):
                    generator_utils.maybe_download(tmp_dir,
                                                   compressed_filename, url)
                if compressed_filename.endswith(".zip"):
                    # Use a context manager so the zip handle is closed.
                    with zipfile.ZipFile(compressed_filepath, "r") as zf:
                        zf.extractall(tmp_dir)

                if dataset[1][0] == "tmx":
                    cleaning_requested = "tmx" in datatypes_to_clean
                    tmx_filename = os.path.join(tmp_dir, dataset[1][1])
                    if tmx_filename.endswith(".gz"):
                        with gzip.open(tmx_filename, "rb") as tmx_file:
                            _tmx_to_source_target(
                                tmx_file,
                                lang1_resfile,
                                lang2_resfile,
                                do_cleaning=cleaning_requested)
                    else:
                        with tf.gfile.Open(tmx_filename) as tmx_file:
                            _tmx_to_source_target(
                                tmx_file,
                                lang1_resfile,
                                lang2_resfile,
                                do_cleaning=cleaning_requested)

                elif dataset[1][0] == "tsv":
                    _, src_column, trg_column, glob_pattern = dataset[1]
                    filenames = tf.gfile.Glob(
                        os.path.join(tmp_dir, glob_pattern))
                    if not filenames:
                        # Capture *.tgz and *.tar.gz too.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                        filenames = tf.gfile.Glob(
                            os.path.join(tmp_dir, glob_pattern))
                    for tsv_filename in filenames:
                        if tsv_filename.endswith(".gz"):
                            # Slice the suffix off; str.strip(".gz") strips
                            # characters, not a suffix.
                            new_filename = tsv_filename[:-len(".gz")]
                            generator_utils.gunzip_file(
                                tsv_filename, new_filename)
                            tsv_filename = new_filename
                        with tf.gfile.Open(tsv_filename) as tsv_file:
                            for line in tsv_file:
                                if line and "\t" in line:
                                    parts = line.split("\t")
                                    source = parts[src_column].strip()
                                    target = parts[trg_column].strip()
                                    clean_pairs = [(source, target)]
                                    if "tsv" in datatypes_to_clean:
                                        clean_pairs = cleaner_en_xx.clean_en_xx_pairs(
                                            clean_pairs)
                                    for source, target in clean_pairs:
                                        if source and target:
                                            lang1_resfile.write(source)
                                            lang1_resfile.write("\n")
                                            lang2_resfile.write(target)
                                            lang2_resfile.write("\n")

                else:
                    lang1_filename, lang2_filename = dataset[1]
                    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
                    lang2_filepath = os.path.join(tmp_dir, lang2_filename)
                    is_sgm = (lang1_filename.endswith("sgm")
                              and lang2_filename.endswith("sgm"))

                    if not (tf.gfile.Exists(lang1_filepath)
                            and tf.gfile.Exists(lang2_filepath)):
                        # For .tar.gz and .tgz files, we read compressed.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                    if lang1_filepath.endswith(".gz"):
                        # Slice the suffix off (str.strip strips characters,
                        # not a suffix).
                        new_filepath = lang1_filepath[:-len(".gz")]
                        generator_utils.gunzip_file(lang1_filepath,
                                                    new_filepath)
                        lang1_filepath = new_filepath
                    if lang2_filepath.endswith(".gz"):
                        new_filepath = lang2_filepath[:-len(".gz")]
                        generator_utils.gunzip_file(lang2_filepath,
                                                    new_filepath)
                        lang2_filepath = new_filepath

                    for example in text_problems.text2text_txt_iterator(
                            lang1_filepath, lang2_filepath):
                        line1res = _preprocess_sgm(example["inputs"], is_sgm)
                        line2res = _preprocess_sgm(example["targets"], is_sgm)
                        clean_pairs = [(line1res, line2res)]
                        if "txt" in datatypes_to_clean:
                            clean_pairs = cleaner_en_xx.clean_en_xx_pairs(
                                clean_pairs)
                        for line1res, line2res in clean_pairs:
                            if line1res and line2res:
                                lang1_resfile.write(line1res)
                                lang1_resfile.write("\n")
                                lang2_resfile.write(line2res)
                                lang2_resfile.write("\n")

    return filename
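The shape of `dataset[1]` selects the branch above. A sketch of the three accepted shapes, with made-up URLs and archive member names:

# Hypothetical dataset specs (every URL and member name is invented).
_EXAMPLE_DATASETS = [
    # TMX branch: dataset[1] == ("tmx", <tmx file inside tmp_dir>).
    ["http://example.com/en-de.tmx.gz", ("tmx", "en-de.tmx.gz")],
    # TSV branch: ("tsv", <source column>, <target column>, <glob>).
    ["http://example.com/en-de.tar.gz", ("tsv", 0, 1, "en-de/*.tsv")],
    # Default branch: a plain (optionally SGML) parallel file pair.
    ["http://example.com/wmt18.tgz",
     ("wmt18/train.en.sgm", "wmt18/train.de.sgm")],
]
compile_data("/tmp/t2t_datagen", _EXAMPLE_DATASETS, "train_compiled",
             datatypes_to_clean=["tmx", "tsv"])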
Example #4
from tensor2tensor.data_generators import cleaner_en_xx


def _clean_sentences(sentence_pairs):
    """Filters (source, target) pairs through the en-xx cleaning heuristics."""
    return list(cleaner_en_xx.clean_en_xx_pairs(sentence_pairs))
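A quick sketch of the filter in use (the pairs are made up):

# Hypothetical usage: pairs failing the cleaning heuristics (e.g. an
# empty side) are dropped from the result.
pairs = [("Hello, world.", "Hallo, Welt."), ("", "leer")]
kept = _clean_sentences(pairs)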