Example #1
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    # Thresholds in the number of characters for LM examples
    lo_thresh = 10
    up_thresh = 256*8

    if dataset_split == problem.DatasetSplit.TRAIN:
      (fname, fid) = self.train_name_id
    else:
      (fname, fid) = self.dev_name_id

    wikifiles = []
    url = "https://drive.google.com/uc?export=download&id=" + fid
    download_path = generator_utils.maybe_download_from_drive(
        tmp_dir, fname, url)
    wiki_file = os.path.join(tmp_dir, fname[:-3])
    if not tf.gfile.Exists(wiki_file):
      generator_utils.gunzip_file(download_path, wiki_file)
    wikifiles.append(wiki_file)

    txt = ""
    for wiki_file in wikifiles:
      for line in tf.gfile.Open(wiki_file):
        line = line.strip()
        if len(txt) + len(line) > up_thresh:
          ret = txt
          txt = ""
          if len(ret) > lo_thresh and len(ret) < up_thresh:
            yield {"targets": ret}

        if not txt:
          txt = line
        else:
          txt = " ".join([txt, line])
Example #2
def _compile_data(tmp_dir, datasets, filename):
    """Concatenate all `datasets` and save to `filename`."""
    filename = os.path.join(tmp_dir, filename)
    lang1_lines, lang2_lines = [], []
    for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)

        lang1_filename, lang2_filename = dataset[1]
        lang1_filepath = os.path.join(tmp_dir, lang1_filename)
        lang2_filepath = os.path.join(tmp_dir, lang2_filename)

        if not os.path.exists(compressed_filepath):
            generator_utils.maybe_download(tmp_dir, compressed_filename, url)
        if not os.path.exists(lang1_filepath) or not os.path.exists(
                lang2_filepath):
            mode = "r:gz" if "gz" in compressed_filepath else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
                corpus_tar.extractall(tmp_dir)
        if ".gz" in lang1_filepath:
            new_filepath = lang1_filepath.strip(".gz")
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
        if ".gz" in lang2_filepath:
            new_filepath = lang2_filepath.strip(".gz")
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
        with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
            with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
                lang1_file_lines = lang1_file.readlines()
                lang2_file_lines = lang2_file.readlines()
                assert len(lang1_file_lines) == len(
                    lang2_file_lines), lang1_filepath
                lang1_lines.extend(lang1_file_lines)
                lang2_lines.extend(lang2_file_lines)

    write_chunk_size = 10000
    assert len(lang1_lines) == len(lang2_lines)
    with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
        # Write in chunks of `write_chunk_size` lines.
        i = 0
        while i * write_chunk_size < len(lang1_lines):
            for line in lang1_lines[i * write_chunk_size:(i + 1) *
                                    write_chunk_size]:
                lang1_file.write(line)
            i += 1
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
        i = 0
        while i * write_chunk_size < len(lang2_lines):
            for line in lang2_lines[i * write_chunk_size:(i + 1) *
                                    write_chunk_size]:
                lang2_file.write(line)
            i += 1
    return filename
Example #3
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_lines, lang2_lines = [], []
  for dataset in datasets:
    url = dataset[0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)

    lang1_filename, lang2_filename = dataset[1]
    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
    lang2_filepath = os.path.join(tmp_dir, lang2_filename)

    if not os.path.exists(compressed_filepath):
      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
      mode = "r:gz" if "gz" in compressed_filepath else "r"
      with tarfile.open(compressed_filepath, mode) as corpus_tar:
        corpus_tar.extractall(tmp_dir)
    if ".gz" in lang1_filepath:
      new_filepath = lang1_filepath.strip(".gz")
      generator_utils.gunzip_file(lang1_filepath, new_filepath)
      lang1_filepath = new_filepath
    if ".gz" in lang2_filepath:
      new_filepath = lang2_filepath.strip(".gz")
      generator_utils.gunzip_file(lang2_filepath, new_filepath)
      lang2_filepath = new_filepath
    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
        lang1_file_lines = lang1_file.readlines()
        lang2_file_lines = lang2_file.readlines()
        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
        lang1_lines.extend(lang1_file_lines)
        lang2_lines.extend(lang2_file_lines)

  write_chunk_size = 10000
  assert len(lang1_lines) == len(lang2_lines)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
    i = 0
    while i * write_chunk_size < len(lang1_lines):
      for line in lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang1_file.write(line)
      i += 1
  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
    i = 0
    while i * write_chunk_size < len(lang2_lines):
      for line in lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang2_file.write(line)
      i += 1
  return filename
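
A side note on the chunked write in Examples #2 and #3: all lines are already held in memory, so the chunking only bounds the size of each individual write call. A hypothetical helper (_write_in_chunks is not part of the original code) expressing the same idea more directly:

def _write_in_chunks(path, lines, chunk_size=10000):
  """Sketch: writes `lines` to `path`, chunk_size lines per write call."""
  with tf.gfile.GFile(path, mode="w") as f:
    for start in range(0, len(lines), chunk_size):
      f.write("".join(lines[start:start + chunk_size]))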
Example #4
def _maybe_download_corpus(tmp_dir, vocab_type, dataset_url, dir_name):
    """Download and unpack the corpus.

    Args:
      tmp_dir: directory containing dataset.
      vocab_type: which vocabulary are we using (unused in this variant).
      dataset_url: URL of the compressed corpus.
      dir_name: name of the directory to unpack into.

    Returns:
      The path of the training file.
    """
    fname = os.path.basename(dataset_url)
    compressed_filepath = generator_utils.maybe_download(
        tmp_dir, fname, dataset_url)

    unpacked_dir = os.path.join(tmp_dir, dir_name)
    if not tf.gfile.Exists(unpacked_dir):
        tf.gfile.MakeDirs(unpacked_dir)
    unpacked_file = os.path.join(unpacked_dir, os.path.splitext(fname)[0])
    generator_utils.gunzip_file(compressed_filepath, unpacked_file)
    txt = os.path.splitext(unpacked_file)[0] + ".txt"
    if not tf.gfile.Exists(txt):
        # Each line of the unpacked file is a JSON record with a
        # "reviewText" field; collect those into a plain .txt file.
        with open(unpacked_file, "rb") as jf, open(txt, "w") as wf:
            for line in jf:
                wf.write(json.loads(line)["reviewText"] + "\n")
    files = os.path.join(tmp_dir, dir_name, "*.txt")
    train_file = None
    for f in tf.gfile.Glob(files):
        train_file = f
    assert train_file, "Training file not found"
    return train_file
Example #5
def prepare_data(data_dir,
                 tmp_dir,
                 sources,
                 out_filename="train.tok",
                 use_jieba=True):
    """Preprocess dataset. Download, unarchive and preprocess. 
    Skips processing if file exists. 
    Writes to e.g. /data/t2t_datagen/train.tok.en
    """

    for source in sources:
        url = source[0]
        filename = os.path.basename(url)
        compressed_file = maybe_download(tmp_dir, filename, url)

        for lang_file in source[1]:
            # pre-processed dataset path, e.g. train.tok.en
            lang = get_lang(lang_file)
            _pp = "%s.%s" % (out_filename, lang)
            tf.logging.info(
                "Reading file: %s, preprocessing to target file: %s" %
                (lang_file, _pp))
            pp_filepath = os.path.join(data_dir, _pp)

            # unzip
            filepath = os.path.join(tmp_dir, lang_file)
            if not tf.gfile.Exists(filepath):
                # "r:gz" covers both .tgz and .tar.gz archives.
                read_type = "r:gz" if filename.endswith("gz") else "r"
                with tarfile.open(compressed_file, read_type) as corpus_tar:
                    corpus_tar.extractall(tmp_dir)

            # For some datasets a second extraction is necessary.
            if lang_file.endswith(".gz"):
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                if tf.gfile.Exists(new_filepath):
                    tf.logging.info(
                        "File %s already exists, skipping unpacking" %
                        new_filepath)
                else:
                    tf.logging.info("Unpacking subdirectory %s" % filepath)
                    gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # read and clean each line, and write to target
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                with tf.gfile.GFile(pp_filepath, mode="a") as out_file:
                    is_zh = lang == "zh"
                    is_zh = is_zh and use_jieba
                    for line in source_file:
                        line = _preprocess(line.strip(), is_zh)
                        out_file.write(line + "\n")
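
Example #5 relies on get_lang and _preprocess helpers that are not shown. A minimal stand-in for get_lang, assuming the language code is simply the file extension (e.g. "train.de-en.en" yields "en"):

def get_lang(filename):
  """Hypothetical: returns the language suffix of a corpus file name."""
  return os.path.splitext(filename)[1].lstrip(".")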
Example #6
def _compile_data(tmp_dir, datasets, filename):
    """Concatenate all `datasets` and save to `filename`."""
    filename = os.path.join(tmp_dir, filename)
    with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
        with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
            for dataset in datasets:
                url = dataset[0]
                compressed_filename = os.path.basename(url)
                compressed_filepath = os.path.join(tmp_dir,
                                                   compressed_filename)

                lang1_filename, lang2_filename = dataset[1]
                lang1_filepath = os.path.join(tmp_dir, lang1_filename)
                lang2_filepath = os.path.join(tmp_dir, lang2_filename)
                is_sgm = (lang1_filename.endswith("sgm")
                          and lang2_filename.endswith("sgm"))

                generator_utils.maybe_download(tmp_dir, compressed_filename,
                                               url)
                if not (os.path.exists(lang1_filepath)
                        and os.path.exists(lang2_filepath)):
                    # For .tar.gz and .tgz files, we read compressed.
                    mode = "r:gz" if compressed_filepath.endswith(
                        "gz") else "r"
                    with tarfile.open(compressed_filepath, mode) as corpus_tar:
                        corpus_tar.extractall(tmp_dir)
                if lang1_filepath.endswith(".gz"):
                    new_filepath = lang1_filepath[:-3]
                    generator_utils.gunzip_file(lang1_filepath, new_filepath)
                    lang1_filepath = new_filepath
                if lang2_filepath.endswith(".gz"):
                    new_filepath = lang2_filepath[:-3]
                    generator_utils.gunzip_file(lang2_filepath, new_filepath)
                    lang2_filepath = new_filepath
                with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
                    with tf.gfile.GFile(lang2_filepath,
                                        mode="r") as lang2_file:
                        line1 = lang1_file.readline()
                        line2 = lang2_file.readline()
                        while line1 or line2:
                            line1res = _preprocess_sgm(line1, is_sgm)
                            line2res = _preprocess_sgm(line2, is_sgm)
                            if line1res or line2res:
                                lang1_resfile.write(line1res.strip() + "\n")
                                lang2_resfile.write(line2res.strip() + "\n")
                            line1 = lang1_file.readline()
                            line2 = lang2_file.readline()

    return filename
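
Examples #6 and #12 through #17 call _preprocess_sgm, which this page does not show. A plausible sketch, inferred from how the callers use it: pass plain text through unchanged, drop SGML structure tags, and keep only the payload of <seg> lines:

def _preprocess_sgm(line, is_sgm):
  """Sketch: strips SGML markup, keeping only the payload of <seg> lines."""
  if not is_sgm:
    return line
  # Drop purely structural tags.
  for tag in ("<srcset", "</srcset", "<doc", "</doc", "<p>", "</p>"):
    if line.startswith(tag):
      return ""
  line = line.strip()
  if line.startswith("<seg") and line.endswith("</seg>"):
    i = line.index(">")
    return line[i + 1:-len("</seg>")]
  return ""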
Example #7
def _compile_data(tmp_dir, datasets, filename):
    """Concatenate all `datasets` and save to `filename`."""
    filename = os.path.join(tmp_dir, filename)
    with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
        with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
            for dataset in datasets:
                url = dataset[0]
                compressed_filename = os.path.basename(url)
                compressed_filepath = os.path.join(tmp_dir,
                                                   compressed_filename)

                lang1_filename, lang2_filename = dataset[1]
                lang1_filepath = os.path.join(tmp_dir, lang1_filename)
                lang2_filepath = os.path.join(tmp_dir, lang2_filename)

                if not os.path.exists(compressed_filepath):
                    generator_utils.maybe_download(tmp_dir,
                                                   compressed_filename, url)
                if not (os.path.exists(lang1_filepath)
                        and os.path.exists(lang2_filepath)):
                    mode = "r:gz" if "gz" in compressed_filepath else "r"
                    with tarfile.open(compressed_filepath, mode) as corpus_tar:
                        corpus_tar.extractall(tmp_dir)
                if ".gz" in lang1_filepath:
                    new_filepath = lang1_filepath.strip(".gz")
                    generator_utils.gunzip_file(lang1_filepath, new_filepath)
                    lang1_filepath = new_filepath
                if ".gz" in lang2_filepath:
                    new_filepath = lang2_filepath.strip(".gz")
                    generator_utils.gunzip_file(lang2_filepath, new_filepath)
                    lang2_filepath = new_filepath
                with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
                    with tf.gfile.GFile(lang2_filepath,
                                        mode="r") as lang2_file:
                        line1 = lang1_file.readline()
                        line2 = lang2_file.readline()
                        while line1 or line2:
                            lang1_resfile.write(line1.strip() + "\n")
                            lang2_resfile.write(line2.strip() + "\n")
                            line1 = lang1_file.readline()
                            line2 = lang2_file.readline()

    return filename
Example #8
  def testGunzipFile(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)

    # Create a test gzip file and gunzip it.
    with gzip.open(tmp_file_path + ".gz", "wb") as gz_file:
      gz_file.write(bytes("test line", "utf-8"))
    generator_utils.gunzip_file(tmp_file_path + ".gz", tmp_file_path + ".txt")

    # Check that the unzipped result is as expected.
    lines = []
    for line in io.open(tmp_file_path + ".txt", "rb"):
      lines.append(line.decode("utf-8").strip())
    self.assertEqual(len(lines), 1)
    self.assertEqual(lines[0], "test line")

    # Clean up.
    os.remove(tmp_file_path + ".gz")
    os.remove(tmp_file_path + ".txt")
    os.remove(tmp_file_path)
Example #10
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Generate samples."""
        if dataset_split == problem.DatasetSplit.TRAIN:
            file_names_ids = self.train_names_ids
        elif dataset_split == problem.DatasetSplit.TEST:
            file_names_ids = self.test_names_ids
        else:
            file_names_ids = self.eval_names_ids

        wiki_generators = []
        for (fname, fid) in file_names_ids:
            url = "https://drive.google.com/uc?export=download&id=" + fid
            download_path = generator_utils.maybe_download_from_drive(
                tmp_dir, fname, url)
            wiki_file = os.path.join(tmp_dir, fname[:-3])
            if not tf.gfile.Exists(wiki_file):
                generator_utils.gunzip_file(download_path, wiki_file)
            wiki_generators.append(
                concat_generator(wiki_file, self.combine_characters_threshold))

        for example in mix_generators(wiki_generators):
            yield example
Example #11
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate samples."""
    if dataset_split == problem.DatasetSplit.TRAIN:
      file_names_ids = self.train_names_ids
    elif dataset_split == problem.DatasetSplit.TEST:
      file_names_ids = self.test_names_ids
    else:
      file_names_ids = self.eval_names_ids

    wiki_generators = []
    for (fname, fid) in file_names_ids:
      url = "https://drive.google.com/uc?export=download&id=" + fid
      download_path = generator_utils.maybe_download_from_drive(
          tmp_dir, fname, url)
      wiki_file = os.path.join(tmp_dir, fname[:-3])
      if not tf.gfile.Exists(wiki_file):
        generator_utils.gunzip_file(download_path, wiki_file)
      wiki_generators.append(
          concat_generator(wiki_file, self.combine_characters_threshold))

    for example in mix_generators(wiki_generators):
      yield example
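
Examples #10 and #11 split the concatenation logic of Example #1 into two helpers that are not shown here. A sketch of what they plausibly look like, mirroring Example #1's thresholding and interleaving the per-file generators round-robin:

def concat_generator(filename, up_threshold, low_threshold=10):
  """Sketch: yields {"targets": ...} examples of bounded character length."""
  txt = ""
  for line in tf.gfile.Open(filename):
    line = line.strip()
    if len(txt) + len(line) + 1 >= up_threshold:
      ret, txt = txt, ""
      if low_threshold < len(ret) < up_threshold:
        yield {"targets": ret}
    txt = line if not txt else " ".join([txt, line])

def mix_generators(generator_list):
  """Sketch: round-robins over the generators until all are exhausted."""
  i, consecutive_stops = 0, 0
  n = len(generator_list)
  while consecutive_stops < n:
    try:
      yield next(generator_list[i % n])
      consecutive_stops = 0
    except StopIteration:
      consecutive_stops += 1
    i += 1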
Example #12
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)

        generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename.strip(".gz")
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  lang1_resfile.write(source.strip() + "\n")
                  lang2_resfile.write(target.strip() + "\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (os.path.exists(lang1_filepath) and
                  os.path.exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath.strip(".gz")
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath.strip(".gz")
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
          with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
            with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
              line1, line2 = lang1_file.readline(), lang2_file.readline()
              while line1 or line2:
                line1res = _preprocess_sgm(line1, is_sgm)
                line2res = _preprocess_sgm(line2, is_sgm)
                if line1res or line2res:
                  lang1_resfile.write(line1res.strip() + "\n")
                  lang2_resfile.write(line2res.strip() + "\n")
                line1, line2 = lang1_file.readline(), lang2_file.readline()

  return filename
Example #13
def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
    """Concatenates all `datasets` and saves to `filename`."""
    datatypes_to_clean = datatypes_to_clean or []
    filename = os.path.join(tmp_dir, filename)
    lang1_fname = filename + ".lang1"
    lang2_fname = filename + ".lang2"
    if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
        tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                        lang1_fname, lang2_fname)
        return filename
    with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
        with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
            for dataset in datasets:
                url = dataset[0]
                compressed_filename = os.path.basename(url)
                compressed_filepath = os.path.join(tmp_dir,
                                                   compressed_filename)
                if url.startswith("http"):
                    generator_utils.maybe_download(tmp_dir,
                                                   compressed_filename, url)
                if compressed_filename.endswith(".zip"):
                    zipfile.ZipFile(os.path.join(compressed_filepath),
                                    "r").extractall(tmp_dir)

                if dataset[1][0] == "tmx":
                    cleaning_requested = "tmx" in datatypes_to_clean
                    tmx_filename = os.path.join(tmp_dir, dataset[1][1])
                    if tmx_filename.endswith(".gz"):
                        with gzip.open(tmx_filename, "rb") as tmx_file:
                            _tmx_to_source_target(
                                tmx_file,
                                lang1_resfile,
                                lang2_resfile,
                                do_cleaning=cleaning_requested)
                    else:
                        with tf.gfile.Open(tmx_filename) as tmx_file:
                            _tmx_to_source_target(
                                tmx_file,
                                lang1_resfile,
                                lang2_resfile,
                                do_cleaning=cleaning_requested)

                elif dataset[1][0] == "tsv":
                    _, src_column, trg_column, glob_pattern = dataset[1]
                    filenames = tf.gfile.Glob(
                        os.path.join(tmp_dir, glob_pattern))
                    if not filenames:
                        # Capture *.tgz and *.tar.gz too.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                        filenames = tf.gfile.Glob(
                            os.path.join(tmp_dir, glob_pattern))
                    for tsv_filename in filenames:
                        if tsv_filename.endswith(".gz"):
                            new_filename = tsv_filename.strip(".gz")
                            generator_utils.gunzip_file(
                                tsv_filename, new_filename)
                            tsv_filename = new_filename
                        with tf.gfile.Open(tsv_filename) as tsv_file:
                            for line in tsv_file:
                                if line and "\t" in line:
                                    parts = line.split("\t")
                                    source = parts[src_column].strip()
                                    target = parts[trg_column].strip()
                                    clean_pairs = [(source, target)]
                                    if "tsv" in datatypes_to_clean:
                                        clean_pairs = cleaner_en_xx.clean_en_xx_pairs(
                                            clean_pairs)
                                    for source, target in clean_pairs:
                                        if source and target:
                                            lang1_resfile.write(source)
                                            lang1_resfile.write("\n")
                                            lang2_resfile.write(target)
                                            lang2_resfile.write("\n")

                else:
                    lang1_filename, lang2_filename = dataset[1]
                    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
                    lang2_filepath = os.path.join(tmp_dir, lang2_filename)
                    is_sgm = (lang1_filename.endswith("sgm")
                              and lang2_filename.endswith("sgm"))

                    if not (tf.gfile.Exists(lang1_filepath)
                            and tf.gfile.Exists(lang2_filepath)):
                        # For .tar.gz and .tgz files, we read compressed.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                    if lang1_filepath.endswith(".gz"):
                        new_filepath = lang1_filepath.strip(".gz")
                        generator_utils.gunzip_file(lang1_filepath,
                                                    new_filepath)
                        lang1_filepath = new_filepath
                    if lang2_filepath.endswith(".gz"):
                        new_filepath = lang2_filepath.strip(".gz")
                        generator_utils.gunzip_file(lang2_filepath,
                                                    new_filepath)
                        lang2_filepath = new_filepath

                    for example in text_problems.text2text_txt_iterator(
                            lang1_filepath, lang2_filepath):
                        line1res = _preprocess_sgm(example["inputs"], is_sgm)
                        line2res = _preprocess_sgm(example["targets"], is_sgm)
                        clean_pairs = [(line1res, line2res)]
                        if "txt" in datatypes_to_clean:
                            clean_pairs = cleaner_en_xx.clean_en_xx_pairs(
                                clean_pairs)
                        for line1res, line2res in clean_pairs:
                            if line1res and line2res:
                                lang1_resfile.write(line1res)
                                lang1_resfile.write("\n")
                                lang2_resfile.write(line2res)
                                lang2_resfile.write("\n")

    return filename
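
Example #13 hands TMX parsing to a _tmx_to_source_target helper that is not reproduced on this page. A plausible sketch, modeled on the inline TMX handling in Example #15 below; note the gzip branch in Example #13 passes a binary stream, so lines may need decoding:

def _tmx_to_source_target(tmx_file, source_resfile, target_resfile,
                          do_cleaning=False):
  """Sketch: writes alternating <seg> pairs from a TMX stream to two files."""
  source, target = None, None
  for line in tmx_file:
    text = line.strip()
    if isinstance(text, bytes):  # gzip.open in "rb" mode yields bytes.
      text = text.decode("utf-8")
    if text.startswith("<seg>") and text.endswith("</seg>"):
      sentence = text[len("<seg>"):-len("</seg>")]
      if source is None:
        source = sentence
      else:
        target = sentence
    if source is not None and target is not None:
      pairs = [(source, target)]
      if do_cleaning:
        pairs = cleaner_en_xx.clean_en_xx_pairs(pairs)
      for src, trg in pairs:
        if src and trg:  # Prevent empty-string examples.
          source_resfile.write(src + "\n")
          target_resfile.write(trg + "\n")
      source, target = None, None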
Example #14
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)

        generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename.strip(".gz")
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  lang1_resfile.write(source.strip() + "\n")
                  lang2_resfile.write(target.strip() + "\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (os.path.exists(lang1_filepath) and
                  os.path.exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath.strip(".gz")
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath.strip(".gz")
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
          with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
            with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
              line1, line2 = lang1_file.readline(), lang2_file.readline()
              while line1 or line2:
                line1res = _preprocess_sgm(line1, is_sgm)
                line2res = _preprocess_sgm(line2, is_sgm)
                if line1res or line2res:
                  lang1_resfile.write(line1res.strip() + "\n")
                  lang2_resfile.write(line2res.strip() + "\n")
                line1, line2 = lang1_file.readline(), lang2_file.readline()

  return filename
Example #15
def compile_data(tmp_dir, datasets, filename):
    """Concatenate all `datasets` and save to `filename`."""
    filename = os.path.join(tmp_dir, filename)
    lang1_fname = filename + ".lang1"
    lang2_fname = filename + ".lang2"
    if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
        tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                        lang1_fname, lang2_fname)
        return filename
    with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
        with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
            for dataset in datasets:
                url = dataset[0]
                compressed_filename = os.path.basename(url)
                compressed_filepath = os.path.join(tmp_dir,
                                                   compressed_filename)
                if url.startswith("http"):
                    generator_utils.maybe_download(tmp_dir,
                                                   compressed_filename, url)

                if dataset[1][0] == "tmx":
                    tmx_filename = os.path.join(tmp_dir, dataset[1][1])
                    if tmx_filename.endswith(".gz"):
                        new_filename = tmx_filename.strip(".gz")
                        if not tf.gfile.Exists(new_filename):
                            generator_utils.gunzip_file(
                                tmx_filename, new_filename)
                        tmx_filename = new_filename
                    source, target = None, None
                    with tf.gfile.Open(tmx_filename) as tmx_file:
                        for line in tmx_file:
                            text = line.strip()
                            if text.startswith("<seg>"):
                                if text.endswith("</seg>"):
                                    sentence = text[
                                        5:-6]  # Strip <seg> and </seg>.
                                    if source is None:
                                        source = sentence
                                    else:
                                        target = sentence
                            if source is not None and target is not None:
                                if source and target:  # Prevent empty string examples.
                                    lang1_resfile.write(source)
                                    lang1_resfile.write("\n")
                                    lang2_resfile.write(target)
                                    lang2_resfile.write("\n")
                                source, target = None, None

                elif dataset[1][0] == "tsv":
                    _, src_column, trg_column, glob_pattern = dataset[1]
                    filenames = tf.gfile.Glob(
                        os.path.join(tmp_dir, glob_pattern))
                    if not filenames:
                        # Capture *.tgz and *.tar.gz too.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                        filenames = tf.gfile.Glob(
                            os.path.join(tmp_dir, glob_pattern))
                    for tsv_filename in filenames:
                        if tsv_filename.endswith(".gz"):
                            new_filename = tsv_filename.strip(".gz")
                            generator_utils.gunzip_file(
                                tsv_filename, new_filename)
                            tsv_filename = new_filename
                        with tf.gfile.Open(tsv_filename) as tsv_file:
                            for line in tsv_file:
                                if line and "\t" in line:
                                    parts = line.split("\t")
                                    source = parts[src_column].strip()
                                    target = parts[trg_column].strip()
                                    if source and target:
                                        lang1_resfile.write(source)
                                        lang1_resfile.write("\n")
                                        lang2_resfile.write(target)
                                        lang2_resfile.write("\n")

                else:
                    lang1_filename, lang2_filename = dataset[1]
                    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
                    lang2_filepath = os.path.join(tmp_dir, lang2_filename)
                    is_sgm = (lang1_filename.endswith("sgm")
                              and lang2_filename.endswith("sgm"))

                    if not (tf.gfile.Exists(lang1_filepath)
                            and tf.gfile.Exists(lang2_filepath)):
                        # For .tar.gz and .tgz files, we read compressed.
                        mode = "r:gz" if compressed_filepath.endswith(
                            "gz") else "r"
                        with tarfile.open(compressed_filepath,
                                          mode) as corpus_tar:
                            corpus_tar.extractall(tmp_dir)
                    if lang1_filepath.endswith(".gz"):
                        new_filepath = lang1_filepath.strip(".gz")
                        generator_utils.gunzip_file(lang1_filepath,
                                                    new_filepath)
                        lang1_filepath = new_filepath
                    if lang2_filepath.endswith(".gz"):
                        new_filepath = lang2_filepath.strip(".gz")
                        generator_utils.gunzip_file(lang2_filepath,
                                                    new_filepath)
                        lang2_filepath = new_filepath

                    for example in text_problems.text2text_txt_iterator(
                            lang1_filepath, lang2_filepath):
                        line1res = _preprocess_sgm(example["inputs"], is_sgm)
                        line2res = _preprocess_sgm(example["targets"], is_sgm)
                        if line1res and line2res:
                            lang1_resfile.write(line1res)
                            lang1_resfile.write("\n")
                            lang2_resfile.write(line2res)
                            lang2_resfile.write("\n")

    return filename
Example #16
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname,
                    lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename.strip(".gz")
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath.strip(".gz")
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath.strip(".gz")
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")

  return filename
Example #17
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname,
                    lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:  # Each dataset entry carries one download URL.
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv": # 不知道神马东西,但是对我们也没有用,不用看
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = tsv_filename.strip(".gz")
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:  # This is the branch we actually use.
          # Get the two languages' filenames and join them onto the path.
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          # Both files ending in "sgm" means they carry SGML markup for
          # _preprocess_sgm to strip.
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # Extract the downloaded archive.
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          # If a file from the archive is itself gzipped, unpack it again.
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-3]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-3]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          # Each example is a dict with two key-value pairs:
          # {"inputs": inputs, "targets": targets}. The loop below strips
          # auxiliary markup and keeps only the text.
          for example in text_problems.text2text_txt_iterator(lang1_filepath,
                                                              lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")

  return filename
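
For orientation, an illustrative call to the compile_data variants above. The datasets argument is a list of (url, spec) pairs, where spec is either a pair of file paths inside the downloaded archive or a ("tsv", source_column, target_column, glob_pattern) record. The WMT paths below follow the usual naming convention but are examples only, not guaranteed to match a current archive:

_EXAMPLE_DATASETS = [
    ("http://data.statmt.org/wmt17/translation-task/"
     "training-parallel-nc-v12.tgz",
     ("training/news-commentary-v12.de-en.en",
      "training/news-commentary-v12.de-en.de")),
]

train_path = compile_data("/tmp/t2t_datagen", _EXAMPLE_DATASETS,
                          "wmt_ende_tok_train")
# Produces wmt_ende_tok_train.lang1 / .lang2, one aligned sentence per line.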