def wmt_parsing_tokens(model_hparams, wrong_vocab_size):
  """Problem hparams for English -> parse-tree translation (subword tokens).

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: approximate vocabulary size embedded in the vocab
      filename; not necessarily the true vocabulary size.

  Returns:
    a tf.contrib.training.HParams
  """
  hp = default_problem_hparams()
  # The subword vocab file is expected to live in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "tokens.vocab.%d" % wrong_vocab_size)
  encoder = text_encoder.SubwordTextEncoder(vocab_path)
  hp.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, encoder.vocab_size)
  }
  hp.target_modality = modality.SymbolModality(model_hparams,
                                               encoder.vocab_size)
  # Inputs and targets share the same subword encoder.
  hp.vocabulary = {"inputs": encoder, "targets": encoder}
  hp.input_space_id = 3
  hp.target_space_id = 15
  return hp
def test_problem_hparams(model_hparams, input_vocab_size, target_vocab_size):
  """Problem hparams for testing model bodies."""
  hp = default_problem_hparams()
  input_symbols = modality.SymbolModality(model_hparams, input_vocab_size)
  target_symbols = modality.SymbolModality(model_hparams, target_vocab_size)
  hp.input_modality = {"inputs": input_symbols}
  hp.target_modality = target_symbols
  # Plain TextEncoders are sufficient for body-only tests.
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.TextEncoder(),
  }
  return hp
def wmt_parsing_characters(model_hparams):
  """Problem hparams for character-level English -> parse-tree translation."""
  hp = default_problem_hparams()
  # Byte-level vocabulary: one symbol per possible byte value.
  byte_vocab_size = 256
  hp.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, byte_vocab_size)
  }
  hp.target_modality = modality.SymbolModality(model_hparams, byte_vocab_size)
  hp.vocabulary = {
      "inputs": text_encoder.ByteTextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  hp.loss_multiplier = 2.0
  hp.input_space_id = 2
  hp.target_space_id = 14
  return hp
def algorithmic(vocab_size, model_hparams):
  """Default problem hparams for algorithmic tasks."""
  hp = default_problem_hparams()
  hp.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, vocab_size)
  }
  hp.target_modality = modality.SymbolModality(model_hparams, vocab_size)
  # One reserved id (padding) for both input and target encoders.
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(num_reserved_ids=1),
      "targets": text_encoder.TextEncoder(num_reserved_ids=1),
  }
  hp.input_space_id = 10
  hp.target_space_id = 11
  return hp
def audio_timit_tokens(model_hparams, wrong_vocab_size):
  """Problem hparams for TIMIT audio -> subword-token transcription.

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: approximate vocabulary size embedded in the vocab
      filename; not necessarily the true vocabulary size.

  Returns:
    a tf.contrib.training.HParams
  """
  hp = default_problem_hparams()
  # The subword vocab file is expected to live in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "tokens.vocab.%d" % wrong_vocab_size)
  encoder = text_encoder.SubwordTextEncoder(vocab_path)
  hp.input_modality = {
      "inputs": modality.AudioModality(model_hparams),
  }
  hp.target_modality = modality.SymbolModality(model_hparams,
                                               encoder.vocab_size)
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": encoder,
  }
  hp.batch_size_multiplier = 256
  hp.loss_multiplier = 2.0
  hp.input_space_id = 13
  hp.target_space_id = 3
  return hp
def testSymbolModalityTargets(self):
  """Checks output shapes of SymbolModality.targets_top_sharded.

  Builds random body outputs and targets, shards them across a CPU-only
  Parallelism, and verifies the concatenated logits shape and a scalar loss.
  """
  batch_size = 10
  num_datashards = 5
  length = 6
  height = 7
  hidden_size = 9
  vocab_size = 11
  model_hparams = tf.contrib.training.HParams(
      symbol_modality_num_shards=4,
      hidden_size=hidden_size,
      label_smoothing=0.2,
      shared_embedding_and_softmax_weights=0)
  # np.random.random_integers is deprecated (removed in NumPy >= 1.25).
  # randint(k) samples from [0, k), identical to -1 + random_integers(k).
  body_output = np.random.randint(
      100, size=(batch_size, length, height, hidden_size))
  targets = np.random.randint(
      vocab_size, size=(batch_size, length, height, 1))
  m = modality.SymbolModality(model_hparams, vocab_size)
  data_parallelism = expert_utils.Parallelism(
      ["/device:CPU:0"] * num_datashards, reuse=True)
  with self.test_session() as session:
    sharded_body_output = tf.split(tf.to_float(body_output), num_datashards)
    sharded_targets = tf.split(targets, num_datashards)
    sharded_logits, train_loss = m.targets_top_sharded(
        sharded_body_output, sharded_targets, data_parallelism)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res1, res2 = session.run((logits, train_loss))
  # Logits add a vocab dimension; the loss reduces to a scalar.
  self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size))
  self.assertEqual(res2.shape, ())
def wmt_ende_v2(model_hparams, vocab_size):
  """Problem hparams for WMT En->De with separate source/target vocabularies."""
  hp = default_problem_hparams()
  # Both vocab files are expected to live in the data directory.
  data_dir = model_hparams.data_dir
  source_vocab_filename = os.path.join(data_dir,
                                       "wmt_ende_v2.en.vocab.%d" % vocab_size)
  target_vocab_filename = os.path.join(data_dir,
                                       "wmt_ende_v2.de.vocab.%d" % vocab_size)
  hp.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, vocab_size)
  }
  hp.target_modality = modality.SymbolModality(model_hparams, vocab_size)
  hp.vocabulary = {
      "inputs": text_encoder.SubwordTextEncoder(source_vocab_filename),
      "targets": text_encoder.SubwordTextEncoder(target_vocab_filename),
  }
  hp.input_space_id = 3
  hp.target_space_id = 8
  return hp
def wmt_ende_tokens(model_hparams, wrong_vocab_size):
  """Problem hparams for WMT English -> German (shared subword vocabulary).

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: approximate vocabulary size embedded in the vocab
      filename; not necessarily the true vocabulary size.

  Returns:
    a tf.contrib.training.HParams
  """
  hp = default_problem_hparams()
  # The subword vocab file is expected to live in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "tokens.vocab.%d" % wrong_vocab_size)
  encoder = text_encoder.SubwordTextEncoder(vocab_path)
  hp.input_modality = {
      "inputs": modality.SymbolModality(model_hparams, encoder.vocab_size)
  }
  hp.target_modality = modality.SymbolModality(model_hparams,
                                               encoder.vocab_size)
  # Source and target share the same encoder.
  hp.vocabulary = {"inputs": encoder, "targets": encoder}
  hp.input_space_id = 3
  hp.target_space_id = 8
  return hp
def image_mnist(model_hparams):
  """Problem hparams for MNIST digit classification."""
  hp = default_problem_hparams()
  # Pixels are byte-valued symbols; targets are the ten digit classes.
  hp.input_modality = {"inputs": modality.SymbolModality(model_hparams, 256)}
  hp.target_modality = modality.ClassLabelModality(model_hparams, 10)
  hp.batch_size_multiplier = 4
  hp.max_expected_batch_size_per_shard = 8
  hp.loss_multiplier = 3.0
  hp.input_space_id = 1
  hp.target_space_id = 1
  return hp
def lm1b_64k(model_hparams):
  """Billion-word language-modeling benchmark, 64k subtoken vocabulary."""
  hp = default_problem_hparams()
  hp.perplexity_exponent = 1.067068
  # Language modeling: no inputs, targets only.
  hp.input_modality = {}
  hp.target_modality = modality.SymbolModality(model_hparams, 65536)
  encoder_path = os.path.join(model_hparams.data_dir,
                              "lm1b_64k.subword_text_encoder")
  hp.vocabulary = {
      "targets": text_encoder.SubwordTextEncoder(encoder_path)
  }
  hp.target_space_id = 3
  return hp
def image_mscoco_characters(model_hparams):
  """Problem hparams for COCO image captioning with character targets."""
  hp = default_problem_hparams()
  hp.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  # Captions are emitted one byte at a time.
  hp.target_modality = modality.SymbolModality(model_hparams, 256)
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  hp.batch_size_multiplier = 128
  hp.max_expected_batch_size_per_shard = 2
  hp.loss_multiplier = 2.0
  hp.input_space_id = 1
  hp.target_space_id = 2
  return hp
def audio_wsj_characters(model_hparams):
  """Problem hparams for WSJ audio -> character transcription."""
  hp = default_problem_hparams()
  hp.input_modality = {
      "inputs": modality.AudioSpectralModality(model_hparams),
  }
  # Transcripts are emitted one byte at a time.
  hp.target_modality = modality.SymbolModality(model_hparams, 256)
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  hp.batch_size_multiplier = 512
  hp.loss_multiplier = 2.0
  hp.input_space_id = 13
  hp.target_space_id = 2
  return hp
def image_mscoco_tokens(model_hparams, vocab_count):
  """Problem hparams for COCO image captioning with subword-token targets."""
  hp = default_problem_hparams()
  hp.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  # The subword vocab file is expected to live in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "tokens.vocab.%d" % vocab_count)
  encoder = text_encoder.SubwordTextEncoder(vocab_path)
  hp.target_modality = modality.SymbolModality(model_hparams,
                                               encoder.vocab_size)
  hp.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": encoder,
  }
  hp.batch_size_multiplier = 256
  hp.max_expected_batch_size_per_shard = 2
  hp.input_space_id = 1
  hp.target_space_id = 3
  return hp
def wmt_ende_bpe32k(model_hparams):
  """Problem hparams for WMT English -> German with a 32k BPE vocabulary."""
  hp = default_problem_hparams()
  vocab_size = 40960
  # A single shared modality object lets inputs and targets share embeddings
  # when model_hparams.shared_source_target_embedding is True.
  shared_modality = modality.SymbolModality(model_hparams, vocab_size)
  hp.input_modality = {"inputs": shared_modality}
  hp.target_modality = shared_modality
  # The BPE vocab file is expected to live in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir, "vocab.bpe.32000")
  hp.vocabulary = {
      "inputs": text_encoder.TokenTextEncoder(vocab_filename=vocab_path),
      "targets": text_encoder.TokenTextEncoder(vocab_filename=vocab_path),
  }
  hp.loss_multiplier = 1.4
  hp.input_space_id = 4
  hp.target_space_id = 9
  return hp
def testSymbolModalityInputs(self):
  """Checks output shape of SymbolModality.inputs_bottom_sharded.

  Shards random symbol ids across a CPU-only Parallelism and verifies the
  concatenated embedding output shape.
  """
  batch_size = 10
  num_datashards = 5
  length = 5
  vocab_size = 5000
  hidden_size = 9
  model_hparams = tf.contrib.training.HParams(
      symbol_modality_num_shards=4,
      hidden_size=hidden_size,
      multiply_embedding_mode="sqrt_depth",
      shared_embedding_and_softmax_weights=0)
  # np.random.random_integers is deprecated (removed in NumPy >= 1.25).
  # randint(k) samples from [0, k), identical to -1 + random_integers(k).
  x = np.random.randint(vocab_size, size=(batch_size, length, 1, 1))
  m = modality.SymbolModality(model_hparams, vocab_size)
  data_parallelism = expert_utils.Parallelism(
      ["/device:CPU:0"] * num_datashards, reuse=True)
  with self.test_session() as session:
    xs = tf.split(x, num_datashards)
    sharded_output = m.inputs_bottom_sharded(xs, data_parallelism)
    output = tf.concat(sharded_output, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(output)
  # Embedding maps each symbol id to a hidden_size vector.
  self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))