# Assumed imports for the helpers used below; the exact module paths depend
# on the surrounding repo (these follow tensor2tensor's layout).
import os
import tarfile

import tensorflow as tf

from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder
from tensor2tensor.data_generators.tokenizer import Tokenizer


def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  # sources = sources or _DATA_FILE_URLS
  # sources = [["big.ch", "big.en"]]
  sources = [["wmt_ende_tok_train.lang1", "wmt_ende_tok_train.lang2"]]
  tf.logging.info("Generating vocab from: %s", str(sources))
  tokenizer = Tokenizer()
  for source in sources:
    # url = source[0]
    # filename = os.path.basename(url)
    # read_type = "r:gz" if "tgz" in filename else "r"
    # compressed_file = maybe_download(tmp_dir, filename, url)
    # with tarfile.open(compressed_file, read_type) as corpus_tar:
    #   corpus_tar.extractall(tmp_dir)
    # for lang_file in source[1]:
    for lang_file in source:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      # if ".gz" in lang_file:
      #   new_filepath = os.path.join(tmp_dir, lang_file[:-3])
      #   if tf.gfile.Exists(new_filepath):
      #     tf.logging.info("File %s already exists, skipping unpacking"
      #                     % new_filepath)
      #   else:
      #     tf.logging.info("Unpacking %s" % filepath)
      #     gunzip_file(filepath, new_filepath)
      #   filepath = new_filepath

      # Use Tokenizer to count the word occurrences, spending only a fixed
      # byte budget per file so huge corpora are sampled rather than fully
      # read.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(text_encoder.native_to_unicode(line))

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
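# The (1, 1e3) pair passed to build_to_target_size above bounds a search over
# the minimum token count used when forming subwords. A minimal sketch of
# that idea, assuming a hypothetical helper `build_with_min_count(counts, m)`
# that returns the vocabulary produced with threshold m (this is not the
# actual tensor2tensor API, just an illustration):
def _sketch_build_to_target_size(target_size, token_counts,
                                 min_val=1, max_val=1000):
  """Binary-search the count threshold toward the requested vocab size."""
  while min_val < max_val:
    mid = (min_val + max_val) // 2
    if len(build_with_min_count(token_counts, mid)) > target_size:
      min_val = mid + 1  # too many subwords, raise the threshold
    else:
      max_val = mid  # small enough, try lowering the threshold
  return build_with_min_count(token_counts, min_val)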
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"
    compressed_file = maybe_download(tmp_dir, filename, url)
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        if os.path.exists(new_filepath):
          tf.logging.info("File %s already exists, skipping unpacking"
                          % new_filepath)
        else:
          tf.logging.info("Unpacking %s" % filepath)
          gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences, spending only a fixed
      # byte budget per file.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
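# `gunzip_file` above is an assumed helper that performs the second
# extraction for .gz corpus files. A minimal sketch, not necessarily the
# repo's exact implementation:
import gzip
import shutil


def gunzip_file(gz_path, new_path):
  """Decompress the file at gz_path and write the result to new_path."""
  with gzip.open(gz_path, "rb") as gz_file, open(new_path, "wb") as new_file:
    shutil.copyfileobj(gz_file, new_file)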
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"
    compressed_file = maybe_download(tmp_dir, filename, url)
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        tf.logging.info("Unpacking %s" % filepath)
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences, spending only a fixed
      # byte budget per file.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  # This variant passes vocab_filepath into the builder, which presumably
  # stores the vocabulary file itself (no separate store_to_file call).
  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
  return vocab
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        for line in source_file:
          line = line.strip()
          _ = tokenizer.encode(line)

  # As above, vocab_filepath is passed into the builder, which presumably
  # stores the vocabulary file itself.
  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
  return vocab
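# A usage sketch; the directory, vocab file name, and size below are
# illustrative values, not ones taken from the repo:
if __name__ == "__main__":
  vocab = get_or_generate_vocab("/tmp/t2t_datagen", "tokens.vocab.32768",
                                32768)
  ids = vocab.encode("Subword vocabularies handle rare words gracefully.")
  tf.logging.info("Encoded into %d subword ids.", len(ids))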