Example #1
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None):
    """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    sources = sources or [["wmt_ende_tok_train.lang1", "wmt_ende_tok_train.lang2"]]
    tf.logging.info("Generating vocab from: %s", str(sources))
    tokenizer = Tokenizer()
    for source in sources:
        for lang_file in source:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    _ = tokenizer.encode(text_encoder.native_to_unicode(line))

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, tokenizer.token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
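For orientation, a minimal call of the variant above might look like the following sketch; the directory, vocabulary file name, and target size are placeholder assumptions, and the tokenized WMT files named inside the function must already sit in tmp_dir.

# Hypothetical usage of the Example #1 variant.  The paths and the target
# size are placeholders; the function expects wmt_ende_tok_train.lang1 and
# wmt_ende_tok_train.lang2 to already exist inside tmp_dir.
vocab = get_or_generate_vocab(
    tmp_dir="/tmp/t2t_datagen",
    vocab_filename="vocab.ende.32768",
    vocab_size=32768)
tf.logging.info("Built a subword vocabulary with %d subtokens", vocab.vocab_size)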
Example #2
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"

    compressed_file = maybe_download(tmp_dir, filename, url)

    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        if os.path.exists(new_filepath):
          tf.logging.info("File %s already exists, skipping unpacking"
                          % new_filepath)
        else:
          tf.logging.info("Unpacking %s" % filepath)
          gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
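Unlike Example #1, this variant and the two that follow read the module-level constant _DATA_FILE_URLS, indexing each entry as source[0] (the archive URL to download) and source[1] (the file names extracted from that archive). The constant itself is defined elsewhere in the module; a hypothetical entry consistent with that access pattern would be:

# Hypothetical shape of _DATA_FILE_URLS, inferred only from how the examples
# index it.  The URL and file names below are placeholders, not real datasets.
_DATA_FILE_URLS = [
    [
        "http://example.com/training-parallel.tgz",  # source[0]: archive to download
        ["train.tok.en", "train.tok.de"],            # source[1]: files inside the archive
    ],
]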
Example #3
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"

    compressed_file = maybe_download(tmp_dir, filename, url)

    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        tf.logging.info("Unpacking subdirectory %s" % filepath)
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
  return vocab
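maybe_download is a helper from the surrounding module. Judging from the call sites above, it takes (tmp_dir, filename, url), downloads the file only when it is not already on disk, and returns the local path; a minimal sketch under those assumptions (not the library's actual implementation):

import os
import urllib.request

import tensorflow as tf


def maybe_download(directory, filename, url):
    """Download url into directory/filename unless it already exists (sketch)."""
    if not tf.gfile.Exists(directory):
        tf.gfile.MakeDirs(directory)
    filepath = os.path.join(directory, filename)
    if not tf.gfile.Exists(filepath):
        tf.logging.info("Downloading %s to %s", url, filepath)
        urllib.request.urlretrieve(url, filepath)
    return filepath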
Example #4
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
    """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = SubwordTextEncoder(vocab_filepath)
        return vocab

    tokenizer = Tokenizer()
    for source in _DATA_FILE_URLS:
        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                for line in source_file:
                    line = line.strip()
                    _ = tokenizer.encode(line)

    vocab = SubwordTextEncoder.build_to_target_size(vocab_size,
                                                    tokenizer.token_counts,
                                                    vocab_filepath, 1, 1e3)
    return vocab
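gunzip_file, used by the variants that handle .gz corpora, is likewise defined elsewhere in the module; its call sites only require that it decompress filepath into new_filepath. A sketch under that assumption:

import gzip
import shutil


def gunzip_file(gz_path, new_path):
    """Decompress the gzip file at gz_path into new_path (sketch inferred from the call sites)."""
    with gzip.open(gz_path, "rb") as gz_file:
        with open(new_path, "wb") as new_file:
            shutil.copyfileobj(gz_file, new_file)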