def _split_sentences(s1, s2):
  s1 = text_encoder.native_to_unicode(s1)
  s2 = text_encoder.native_to_unicode(s2)
  s1 = re.sub(r'(\w[A-Z]|[0-9a-z])([.!?]) ([A-Z])', r'\1\2__|__\3', s1)
  s2 = re.sub(r'([^0-9][.!?]) ([A-Z])', r'\1__|__\2', s2)
  s1_subsentences = s1.split('__|__')
  s2_subsentences = s2.split('__|__')
  return s1_subsentences, s2_subsentences
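# Minimal usage sketch of _split_sentences (the sample strings below are
# invented for illustration): both inputs are split on sentence-final
# punctuation followed by a space and a capital letter, using "__|__" as a
# temporary separator.
src = 'This is one sentence. Here is another.'
ref = 'Dies ist ein Satz. Hier ist noch einer.'
src_parts, ref_parts = _split_sentences(src, ref)
# src_parts -> ['This is one sentence.', 'Here is another.']
# ref_parts -> ['Dies ist ein Satz.', 'Hier ist noch einer.']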
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
  """Compute BLEU for two files (reference and hypothesis translation)."""
  ref_lines = text_encoder.native_to_unicode(
      tf.gfile.Open(ref_filename, "r").read()).splitlines()
  hyp_lines = text_encoder.native_to_unicode(
      tf.gfile.Open(hyp_filename, "r").read()).splitlines()
  assert len(ref_lines) == len(hyp_lines)
  if not case_sensitive:
    ref_lines = [x.lower() for x in ref_lines]
    hyp_lines = [x.lower() for x in hyp_lines]
  ref_tokens = [bleu_tokenize(x) for x in ref_lines]
  hyp_tokens = [bleu_tokenize(x) for x in hyp_lines]
  return compute_bleu(ref_tokens, hyp_tokens)
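# Hypothetical call sketch for bleu_wrapper: the paths are placeholders, and
# both files are assumed to hold one translation per line, aligned line by
# line (the assert above enforces equal line counts).
uncased_bleu = bleu_wrapper('newstest.ref.txt', 'newstest.hyp.txt')
cased_bleu = bleu_wrapper('newstest.ref.txt', 'newstest.hyp.txt',
                          case_sensitive=True)
print('BLEU uncased=%.4f cased=%.4f' % (uncased_bleu, cased_bleu))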
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type != VocabType.SUBWORD:
    raise ValueError('Unsupported VocabType: %s' % self.vocab_type)

  vocab_filepath = os.path.join(data_dir, self.vocab_filename)

  if force_get or tf.gfile.Exists(vocab_filepath):
    tf.logging.info('Found vocab file: %s', vocab_filepath)
    return ModernMTSubwordTextEncoder(vocab_filepath)

  # Vocabulary file does not exist: generate vocabulary
  # --------------------------------------------------------------------------

  # Load token counts file if present (or generate if missing)
  tokens_filepath = os.path.join(tmp_dir, 'token_counts.dict')

  if tf.gfile.Exists(tokens_filepath):
    tf.logging.info('Found token counts file: %s', tokens_filepath)
    token_counts = self._load_token_counts(tokens_filepath)
  else:
    tf.logging.info('Generating token counts file: %s', tokens_filepath)
    token_counts = defaultdict(int)
    for item in self.generate_text_for_vocab(data_dir, tmp_dir):
      for tok in text_encoder.native_to_unicode(item).split(u' '):
        token_counts[tok] += 1
    self._save_token_counts(token_counts, tokens_filepath)

  # Build subword vocabulary
  builder = SubwordTextEncoderBuilder(
      self.approx_vocab_size, custom_tokens=self._make_reserved_tokens())
  return builder.build(token_counts, vocab_filepath)
def configure_job():
  """Construct jobSpec for ML Engine job."""
  # See documentation:
  # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
  training_input = {
      "pythonModule": "tensor2tensor.bin.t2t_trainer",
      "args": flags_as_args(),
      "region": text_encoder.native_to_unicode(cloud.default_region()),
      "runtimeVersion": RUNTIME_VERSION,
      "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7",
      "jobDir": FLAGS.output_dir,
      "scaleTier": "CUSTOM",
      "masterType": FLAGS.cloud_mlengine_master_type or
                    get_default_master_type(num_gpus=FLAGS.worker_gpu)
  }
  if FLAGS.use_tpu:
    training_input["masterType"] = (FLAGS.cloud_mlengine_master_type or
                                    "standard")
    training_input["workerType"] = "cloud_tpu"
    training_input["workerCount"] = 1
  if FLAGS.hparams_range:
    tf.logging.info("Configuring hyperparameter tuning.")
    training_input["hyperparameters"] = configure_autotune(
        FLAGS.hparams_range,
        FLAGS.autotune_objective,
        FLAGS.autotune_maximize,
        FLAGS.autotune_max_trials,
        FLAGS.autotune_parallel_trials,
    )
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp)
  job_spec = {"jobId": job_name, "trainingInput": training_input}
  return job_spec
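# Illustrative shape of the job_spec returned by configure_job(); the field
# values below are invented examples, not real defaults:
#
# {
#   "jobId": "transformer_translate_ende_wmt32k_t2t_20180101_120000",
#   "trainingInput": {
#       "pythonModule": "tensor2tensor.bin.t2t_trainer",
#       "args": [...],
#       "region": "us-central1",
#       "runtimeVersion": RUNTIME_VERSION,
#       "pythonVersion": "2.7",
#       "jobDir": "gs://my-bucket/output",
#       "scaleTier": "CUSTOM",
#       "masterType": "standard_p100",
#   },
# }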
def _get_or_build_subword_text_encoder(tmp_dir):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a SubwordTextEncoder.
  """
  filepath = os.path.join(tmp_dir, "lm1b_32k.subword_text_encoder")
  if tf.gfile.Exists(filepath):
    return text_encoder.SubwordTextEncoder(filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  ret = text_encoder.SubwordTextEncoder()
  ret.build_from_token_counts(token_counts, min_count=5)
  ret.store_to_file(filepath)
  return ret
def generator(tmp_dir, train, characters=False):
  """Generator for lm1b sentences.

  Args:
    tmp_dir: a string.
    train: a boolean.
    characters: a boolean

  Yields:
    A dictionary {"inputs": [0], "targets": [<subword ids>]}
  """
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  files = (_train_data_filenames(tmp_dir)
           if train else [_dev_data_filename(tmp_dir)])
  if characters:
    encoder = text_encoder.ByteTextEncoder()
  else:
    encoder = _get_or_build_subword_text_encoder(tmp_dir)
  for filepath in files:
    tf.logging.info("filepath = %s", filepath)
    for line in tf.gfile.Open(filepath):
      tokens = encoder.encode(
          _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
      tokens.append(EOS)
      yield {"inputs": [0], "targets": tokens}
def rank_reference_paragraphs(wiki_title, references_content, normalize=True):
  """Rank and return reference paragraphs by tf-idf score on title tokens."""
  normalized_title = _normalize_text(wiki_title)
  title_tokens = _tokens_to_score(
      set(tokenizer.encode(text_encoder.native_to_unicode(normalized_title))))
  ref_paragraph_info = []
  doc_counts = collections.defaultdict(int)
  for ref in references_content:
    for paragraph in ref.split("\n"):
      normalized_paragraph = _normalize_text(paragraph)
      if cc_utils.filter_paragraph(normalized_paragraph):
        # Skip paragraph
        continue
      counts = _token_counts(normalized_paragraph, title_tokens)
      for token in title_tokens:
        if counts[token]:
          doc_counts[token] += 1
      content = normalized_paragraph if normalize else paragraph
      info = {"content": content, "counts": counts}
      ref_paragraph_info.append(info)

  for info in ref_paragraph_info:
    score = 0.
    for token in title_tokens:
      term_frequency = info["counts"][token]
      inv_doc_frequency = (
          float(len(ref_paragraph_info)) / max(doc_counts[token], 1))
      score += term_frequency * math.log(inv_doc_frequency)
    info["score"] = score

  ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
  return [info["content"] for info in ref_paragraph_info]
def _rank_reference_paragraphs(wiki_title, references_content):
  """Rank and return reference paragraphs by tf-idf score on title tokens."""
  title_tokens = _tokens_to_score(
      set(tokenizer.encode(text_encoder.native_to_unicode(wiki_title))))
  ref_paragraph_info = []
  doc_counts = collections.defaultdict(int)
  for ref in references_content:
    for paragraph in ref.split("\n"):
      paragraph = _normalize_text(paragraph)
      if cc_utils.filter_paragraph(paragraph):
        # Skip paragraph
        continue
      counts = _token_counts(paragraph, title_tokens)
      for token in title_tokens:
        if counts[token]:
          doc_counts[token] += 1
      info = {"content": paragraph, "counts": counts}
      ref_paragraph_info.append(info)

  for info in ref_paragraph_info:
    score = 0.
    for token in title_tokens:
      term_frequency = info["counts"][token]
      inv_doc_frequency = (
          float(len(ref_paragraph_info)) / max(doc_counts[token], 1))
      score += term_frequency * math.log(inv_doc_frequency)
    info["score"] = score

  ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
  return [info["content"] for info in ref_paragraph_info]
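# Worked sketch of the tf-idf scoring used in the two ranking functions above
# (all numbers are invented): a paragraph's score sums, over title tokens,
# term_frequency * log(num_paragraphs / max(doc_count, 1)).
import math

num_paragraphs = 100
term_frequency = {'einstein': 3, 'relativity': 1}
doc_count = {'einstein': 20, 'relativity': 5}
score = sum(tf_ * math.log(float(num_paragraphs) / max(doc_count[tok], 1))
            for tok, tf_ in term_frequency.items())
# score = 3 * log(100/20) + 1 * log(100/5) ~= 7.82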
def configure_job():
  """Construct jobSpec for ML Engine job."""
  # See documentation:
  # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
  training_input = {
      "pythonModule": "tensor2tensor.bin.t2t_trainer",
      "args": flags_as_args(),
      "region": text_encoder.native_to_unicode(default_region()),
      "runtimeVersion": RUNTIME_VERSION,
      "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7",
      "jobDir": FLAGS.output_dir,
      "scaleTier": "CUSTOM",
      "masterType": FLAGS.cloud_mlengine_master_type or
                    get_default_master_type(num_gpus=FLAGS.worker_gpu)
  }
  if FLAGS.use_tpu:
    training_input["masterType"] = (FLAGS.cloud_mlengine_master_type or
                                    "standard")
    training_input["workerType"] = "cloud_tpu"
    training_input["workerCount"] = 1
  if FLAGS.hparams_range:
    tf.logging.info("Configuring hyperparameter tuning.")
    training_input["hyperparameters"] = configure_autotune(
        FLAGS.hparams_range,
        FLAGS.autotune_objective,
        FLAGS.autotune_maximize,
        FLAGS.autotune_max_trials,
        FLAGS.autotune_parallel_trials,
    )
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp)
  job_spec = {"jobId": job_name, "trainingInput": training_input}
  return job_spec
def get_or_generate_vocab_es(tmp_dir, vocab_filename, vocab_size, datasets):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print(vocab_filepath)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab
  sources = datasets
  tf.logging.info("Generating vocab from: %s", str(sources))
  token_counts = defaultdict(int)
  for source in sources:
    for lang_file in source[0]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      print(filepath)
      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
            token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
def generator(self, data_dir, tmp_dir, is_training):
  """Generator for lm1b sentences.

  Args:
    data_dir: data dir.
    tmp_dir: tmp dir.
    is_training: a boolean.

  Yields:
    A dictionary {"inputs": [0], "targets": [<subword ids>]}
  """
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  files = (_train_data_filenames(tmp_dir)
           if is_training else [_dev_data_filename(tmp_dir)])
  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  else:
    vocab_filepath = os.path.join(data_dir, self.vocab_file)
    encoder = _get_or_build_subword_text_encoder(
        tmp_dir, vocab_filepath, self.targeted_vocab_size)
  for filepath in files:
    tf.logging.info("filepath = %s", filepath)
    for line in tf.gfile.Open(filepath):
      tokens = encoder.encode(
          _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
      tokens.append(EOS)
      yield {"inputs": [0], "targets": tokens}
def _token_counts(text, token_set=None):
  counts = collections.defaultdict(int)
  for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
    if token_set and token not in token_set:
      continue
    counts[token] += 1
  return counts
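# Minimal usage sketch of _token_counts (example strings invented): only
# tokens in the restricting token_set are counted.
title_tokens = {'solar', 'eclipse'}
counts = _token_counts('A total solar eclipse is a rare solar event.',
                       title_tokens)
# counts -> roughly {'solar': 2, 'eclipse': 1}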
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  if target_size == 2**15:  # legacy behavior
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
  else:
    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret
def encode_with_indexes(self, raw_text):
  tokens = text_encoder.native_to_unicode(raw_text).split(u' ')
  subtokens = self._tokens_to_subtoken_strings(tokens)
  subtoken_ids = [
      self._subtoken_string_to_id[subtoken] for subtoken in subtokens
  ]
  return subtoken_ids, self._get_indexes(subtokens)
def generate_bpe_vocab(file_list, targeted_vocab_size):
  token_counts = defaultdict(int)
  for item in generator_fn(file_list):
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      targeted_vocab_size, token_counts, 1, 1e3)
  return vocab
def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size,
                          sources=None):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(data_dir, vocab_filename)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab
  sources = sources or _DATA_FILE_URLS
  tf.logging.info("Generating vocab from: %s", str(sources))
  token_counts = defaultdict(int)
  for source in sources:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"
    compressed_file = maybe_download(tmp_dir, filename, url)
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)
    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        if tf.gfile.Exists(new_filepath):
          tf.logging.info(
              "Subdirectory %s already exists, skipping unpacking" % filepath)
        else:
          tf.logging.info("Unpacking subdirectory %s" % filepath)
          gunzip_file(filepath, new_filepath)
        filepath = new_filepath
      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
            token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
def launch_job(job_spec):
  """Launch job on ML Engine."""
  project_id = "projects/{}".format(
      text_encoder.native_to_unicode(default_project()))
  credentials = GoogleCredentials.get_application_default()
  cloudml = discovery.build("ml", "v1", credentials=credentials,
                            cache_discovery=False)
  request = cloudml.projects().jobs().create(body=job_spec, parent=project_id)
  request.execute()
def launch_job(job_spec):
  """Launch job on ML Engine."""
  project_id = "projects/{}".format(
      text_encoder.native_to_unicode(cloud.default_project()))
  credentials = GoogleCredentials.get_application_default()
  cloudml = discovery.build("ml", "v1", credentials=credentials,
                            cache_discovery=False)
  request = cloudml.projects().jobs().create(body=job_spec, parent=project_id)
  request.execute()
def _load_token_counts(filepath):
  token_counts = {}
  with tf.gfile.GFile(filepath, mode='rb') as tokens_file:
    for line in tokens_file:
      line = text_encoder.native_to_unicode(line.rstrip('\n'))
      count, token = line.split(u' ', 1)
      token_counts[token] = int(count)
  return token_counts
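# Hedged sketch of the on-disk format _load_token_counts expects: one
# "count token" pair per line, split on the first space. A matching writer
# (the real _save_token_counts referenced elsewhere in this listing is a
# method; this module-level version is only an assumed illustration) could
# look like this:
def _save_token_counts(token_counts, filepath):
  with tf.gfile.GFile(filepath, mode='w') as tokens_file:
    for token, count in sorted(token_counts.items()):
      tokens_file.write(u'%d %s\n' % (count, token))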
def _normalize_string(raw_str):
  """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
    A string which is ready to be tokenized using split()
  """
  return ' '.join(
      token.strip()
      for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
def _normalize_string(raw_str):
  """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
    A string which is ready to be tokenized using split()
  """
  return " ".join(
      token.strip()
      for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  del data_dir
  split_files = {
      problem.DatasetSplit.TRAIN: _train_data_filenames(tmp_dir),
      problem.DatasetSplit.EVAL: _dev_data_filenames(tmp_dir),
  }
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  files = split_files[dataset_split]
  for filepath in files:
    tf.logging.info("filepath = %s", filepath)
    for line in tf.gfile.Open(filepath):
      txt = _replace_oov(original_vocab, text_encoder.native_to_unicode(line))
      yield {"targets": txt}
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator_fn):
  """Inner implementation for vocab generators."""
  vocab_filepath = os.path.join(data_dir, vocab_filename)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab
  token_counts = defaultdict(int)
  for item in generator_fn():
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
def generate_bpe_vocab(file_list, targeted_vocab_size):
  def generator_fn():
    for filepath in file_list:
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        # file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
        for line in source_file:
          # if file_byte_budget <= 0:
          #   break
          line = line.strip()
          # file_byte_budget -= len(line)
          yield line

  token_counts = defaultdict(int)
  for item in generator_fn():
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      targeted_vocab_size, token_counts, 1, 1e3)
  return vocab
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url)
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set(
      [text_encoder.native_to_unicode(l.strip())
       for l in tf.gfile.Open(vocab_filepath)])
def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index,
                                 vocab_filename, vocab_size):
  r"""Generate a vocabulary from a tabbed source file.

  The source is a file of source, target pairs, where each line contains
  a source string and a target string, separated by a tab ('\t') character.
  The index parameter specifies 0 for the source or 1 for the target.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to the temporary directory.
    source_filename: the name of the tab-separated source file.
    index: index.
    vocab_filename: the name of the vocabulary file.
    vocab_size: vocabulary size.

  Returns:
    The vocabulary.
  """
  vocab_filepath = os.path.join(data_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  # Use Tokenizer to count the word occurrences.
  token_counts = defaultdict(int)
  filepath = os.path.join(tmp_dir, source_filename)
  with tf.gfile.GFile(filepath, mode="r") as source_file:
    for line in source_file:
      line = line.strip()
      if line and "\t" in line:
        parts = line.split("\t", maxsplit=1)
        part = parts[index].strip()
        for tok in tokenizer.encode(text_encoder.native_to_unicode(part)):
          token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
def tar_and_copy_t2t(train_dir):
  """Tar Tensor2Tensor and cp to train_dir."""
  tf.logging.info("Tarring and pushing local Tensor2Tensor package.")
  output = text_encoder.native_to_unicode(shell_output(
      "pip show tensor2tensor")).split("\n")
  assert output[1].startswith("Version")
  assert output[7].startswith("Location")
  t2t_version = output[1].split(":")[1].strip()
  t2t_dir = output[7].split(":")[1].strip()
  # A local installation cloned from GitHub will have a setup.py file and a
  # docs folder
  is_local_t2t = all([
      tf.gfile.Exists(os.path.join(t2t_dir, fname))
      for fname in ["setup.py", "docs/cloud_mlengine.md"]
  ])
  if is_local_t2t:
    tf.logging.info("Found local T2T installation. Tarring directory %s",
                    t2t_dir)
  else:
    # PyPI installation
    # Create a folder with just a setup.py file pointing to the right version
    tf.logging.info("Found PyPI T2T installation. Launching tensor2tensor==%s",
                    t2t_version)
    t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp")
    shutil.rmtree(t2t_dir, ignore_errors=True)
    os.mkdir(t2t_dir)
    setup_fname = os.path.join(t2t_dir, "setup.py")
    setup_file_str = get_setup_file(
        name="DummyT2TPackage",
        packages=["tensor2tensor==%s" % t2t_version]
    )
    with tf.gfile.Open(setup_fname, "w") as f:
      f.write(setup_file_str)
  t2t_tar = _tar_and_copy(t2t_dir, train_dir)
  return t2t_tar
def tar_and_copy_t2t(train_dir):
  """Tar Tensor2Tensor and cp to train_dir."""
  tf.logging.info("Tarring and pushing local Tensor2Tensor package.")
  output = text_encoder.native_to_unicode(
      cloud.shell_output("pip show tensor2tensor")).split("\n")
  assert output[1].startswith("Version")
  assert output[7].startswith("Location")
  t2t_version = output[1].split(":")[1].strip()
  t2t_dir = output[7].split(":")[1].strip()
  # A local installation cloned from GitHub will have a setup.py file and a
  # docs folder
  is_local_t2t = all([
      tf.gfile.Exists(os.path.join(t2t_dir, fname))
      for fname in ["setup.py", "docs/cloud_mlengine.md"]
  ])
  if is_local_t2t:
    tf.logging.info("Found local T2T installation. Tarring directory %s",
                    t2t_dir)
  else:
    # PyPI installation
    # Create a folder with just a setup.py file pointing to the right version
    tf.logging.info(
        "Found PyPI T2T installation. Launching tensor2tensor==%s",
        t2t_version)
    t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp")
    shutil.rmtree(t2t_dir, ignore_errors=True)
    os.mkdir(t2t_dir)
    setup_fname = os.path.join(t2t_dir, "setup.py")
    setup_file_str = get_setup_file(
        name="DummyT2TPackage",
        packages=["tensor2tensor==%s" % t2t_version])
    with tf.gfile.Open(setup_fname, "w") as f:
      f.write(setup_file_str)
  t2t_tar = _tar_and_copy(t2t_dir, train_dir)
  return t2t_tar
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  if target_size == 2 ** 15:  # legacy behavior
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
  else:
    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If
      None, then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)
  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab
  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1
  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
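# Hypothetical call sketch for get_or_generate_vocab_inner: the generator
# simply yields raw text lines; the file names and target size below are
# placeholders, not real defaults.
def _text_generator():
  for line in tf.gfile.Open('/tmp/corpus.txt'):
    yield line.strip()

vocab = get_or_generate_vocab_inner(
    data_dir='/tmp/t2t_data',
    vocab_filename='vocab.example.8192',
    vocab_size=8192,
    generator=_text_generator())
print('vocab size: %d' % vocab.vocab_size)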
def test_native_to_unicode(self):
  s = r"foo bar"
  self.assertIsInstance(text_encoder.native_to_unicode(s), unicode)
  self.assertEqual(text_encoder.native_to_unicode(s), u"foo bar")
def encode(self, raw_text):
  return self._tokens_to_subtoken_ids(
      text_encoder.native_to_unicode(raw_text).split(u' '))
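# Minimal usage sketch for the encode() override above: the input is assumed
# to be pre-tokenized text with tokens joined by single spaces, and the
# encoder class name is taken from get_or_create_vocab earlier in this
# listing (the vocab path is a placeholder).
encoder = ModernMTSubwordTextEncoder('/path/to/vocab.subwords')
subtoken_ids = encoder.encode(u'das ist ein Beispiel .')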
def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size,
                          text_files, mode='subword'):
  """Implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If
      None, then do not save the vocab even if it doesn't exist.
    ...
    vocab_filename: relative filename where vocab file is stored
    vocab_size: None is accepted. target size of the vocabulary constructed
      by TextEncoder
    ...

  Returns:
    A TokenTextEncoder vocabulary object.
  """
  def generate():
    tf.logging.info("Generating vocab from: %s", str(text_files))
    for lang_file in text_files:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        for line in source_file:
          line = line.strip()
          yield line

  def encode(text):
    if mode == 'character':
      return list(text)
    else:
      return tokenizer.encode(text)

  def encoder(vocab_filepath):
    if mode == 'character':
      return CharacterTextEncoder(vocab_filepath, replace_oov="UNK")
    else:
      return text_encoder.SubwordTextEncoder(vocab_filepath)

  def build_and_save_vocab(vocab_filepath, vocab_size, token_counts):
    if mode == 'character':
      with tf.gfile.GFile(vocab_filepath, mode="w") as f:
        word_list = list(map(lambda x: x[0], token_counts.most_common()))
        word_list = ['UNK'] + word_list
        if vocab_size is not None:
          word_list = word_list[:vocab_size]
        for word in word_list:
          f.write(word + '\n')
    else:
      text_encoder.SubwordTextEncoder.build_to_target_size(
          vocab_size, token_counts, 1, 1e3).store_to_file(vocab_filepath)

  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)
  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = encoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = Counter()
  for item in generate():
    for tok in encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1
  build_and_save_vocab(vocab_filepath, vocab_size, token_counts)
  vocab = encoder(vocab_filepath)
  return vocab
def test_native_to_unicode(self):
  s = r"foo bar"
  s_unicode = text_encoder.native_to_unicode(s)
  if six.PY2:
    self.assertIsInstance(s_unicode, unicode)
  self.assertEqual(s_unicode, u"foo bar")