def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  encoder_state = []

  for layer_index, layer in enumerate(self.layers):
    input_depth = inputs.get_shape().as_list()[-1]

    if layer_index == 0:
      # For the first input, make the number of timesteps a multiple of the
      # total reduction factor.
      total_reduction_factor = pow(self.reduction_factor, len(self.layers) - 1)

      current_length = tf.shape(inputs)[1]
      factor = tf.divide(tf.cast(current_length, tf.float32), total_reduction_factor)
      new_length = tf.cast(tf.ceil(factor), tf.int32) * total_reduction_factor
      inputs = pad_in_time(inputs, new_length - current_length)

      # Lengths should not be smaller than the total reduction factor.
      sequence_length = tf.maximum(sequence_length, total_reduction_factor)
    else:
      # In other cases, reduce the time dimension.
      inputs = tf.reshape(inputs, [
          tf.shape(inputs)[0],
          -1,
          input_depth * self.reduction_factor])
      if sequence_length is not None:
        sequence_length = tf.div(sequence_length, self.reduction_factor)

    with tf.variable_scope("layer_{}".format(layer_index)):
      outputs, state, sequence_length = layer.encode(
          inputs,
          sequence_length=sequence_length,
          mode=mode)

      encoder_state.append(state)
      inputs = outputs

  return (outputs, self.state_reducer.reduce(encoder_state), sequence_length)
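To make the first layer's padding arithmetic concrete, here is a small worked example in plain Python. The layer count, reduction factor, and input length are assumed values for illustration; `pad_in_time` itself comes from `opennmt.layers.reducer`.

# Assumed configuration: 3 layers with reduction_factor = 2.
reduction_factor = 2
num_layers = 3

# Every layer after the first divides the time dimension by the reduction
# factor, so the input length must be a multiple of 2 ** (3 - 1) = 4.
total_reduction_factor = pow(reduction_factor, num_layers - 1)

current_length = 50
# Ceiling division, mirroring tf.cast(tf.ceil(...), tf.int32) above.
new_length = -(-current_length // total_reduction_factor) * total_reduction_factor
print(new_length)                   # 52
print(new_length - current_length)  # 2 timesteps of padding for pad_in_time

The `call` variant below is the same pyramidal reduction expressed in TensorFlow 2 style: `tf.variable_scope` disappears, each layer is invoked directly, and the `mode` argument becomes a `training` flag.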
def call(self, inputs, sequence_length=None, training=None):
  encoder_state = []

  for layer_index, layer in enumerate(self.layers):
    input_depth = inputs.shape[-1]

    if layer_index == 0:
      # For the first input, make the number of timesteps a multiple of the
      # total reduction factor.
      total_reduction_factor = pow(self.reduction_factor, len(self.layers) - 1)

      current_length = tf.shape(inputs)[1]
      factor = tf.cast(current_length, tf.float32) / total_reduction_factor
      new_length = (
          tf.cast(tf.math.ceil(factor), tf.int32) * total_reduction_factor)
      inputs = pad_in_time(inputs, new_length - current_length)

      # Lengths should not be smaller than the total reduction factor.
      sequence_length = tf.maximum(sequence_length, total_reduction_factor)
    else:
      # In other cases, reduce the time dimension.
      inputs = tf.reshape(
          inputs,
          [tf.shape(inputs)[0], -1, input_depth * self.reduction_factor])
      if sequence_length is not None:
        sequence_length //= self.reduction_factor

    outputs, state, sequence_length = layer(
        inputs, sequence_length=sequence_length, training=training)
    encoder_state.append(state)
    inputs = outputs

  return (outputs, self.state_reducer(encoder_state), sequence_length)
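As a usage sketch, assuming these methods belong to OpenNMT-tf's PyramidalRNNEncoder (the constructor arguments below are an assumption for illustration, not taken from the snippets above):

import tensorflow as tf
import opennmt as onmt

# Hypothetical instantiation: 3 stacked RNN layers, with the time dimension
# halved between consecutive layers.
encoder = onmt.encoders.PyramidalRNNEncoder(
    num_layers=3, num_units=256, reduction_factor=2)

inputs = tf.random.uniform([4, 50, 256])                 # [batch, time, depth]
lengths = tf.constant([50, 42, 37, 20], dtype=tf.int32)

outputs, state, out_lengths = encoder(
    inputs, sequence_length=lengths, training=True)
# Time is padded from 50 to 52 (a multiple of 2 ** 2 = 4) and then reduced
# twice, so outputs spans 52 // 4 = 13 timesteps and out_lengths is roughly
# lengths // 4.

Padding first means the repeated reshape never drops trailing timesteps, whatever the input length.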
import sys

import numpy as np
import tensorflow as tf
import opennmt as onmt


def train(src,
          tgt,
          src_trans,
          tgt_trans,
          src_vocab,
          tgt_vocab,
          src_emb=None,
          tgt_emb=None):
  # Step 1

  from opennmt import constants
  from opennmt.utils.misc import count_lines

  def load_vocab(vocab_file):
    """Returns a lookup table and the vocabulary size."""
    vocab_size = count_lines(vocab_file) + 1  # Add UNK.
    vocab = tf.contrib.lookup.index_table_from_file(
        vocab_file,
        vocab_size=vocab_size - 1,
        num_oov_buckets=1)
    return vocab, vocab_size

  def load_data(input_file,
                translated_file,
                input_vocab,
                translated_vocab,
                batch_size=32,
                max_seq_len=50,
                num_buckets=5):
    """Returns an iterator over the training data."""

    def _make_dataset(text_file, vocab):
      dataset = tf.data.TextLineDataset(text_file)
      dataset = dataset.map(lambda x: tf.string_split([x]).values)  # Split on spaces.
      dataset = dataset.map(vocab.lookup)  # Lookup tokens in the vocabulary.
      return dataset

    def _key_func(x):
      bucket_width = (max_seq_len + num_buckets - 1) // num_buckets
      bucket_id = x["length"] // bucket_width
      bucket_id = tf.minimum(bucket_id, num_buckets)
      return tf.to_int64(bucket_id)

    def _reduce_func(unused_key, dataset):
      return dataset.padded_batch(batch_size, {
          "ids": [None],
          "ids_in": [None],
          "ids_out": [None],
          "length": [],
          "trans_ids": [None],
          "trans_length": []})

    bos = tf.constant([constants.START_OF_SENTENCE_ID], dtype=tf.int64)
    eos = tf.constant([constants.END_OF_SENTENCE_ID], dtype=tf.int64)

    # Make a dataset from the input and translated files.
    input_dataset = _make_dataset(input_file, input_vocab)
    translated_dataset = _make_dataset(translated_file, translated_vocab)
    dataset = tf.data.Dataset.zip((input_dataset, translated_dataset))
    dataset = dataset.shuffle(200000)

    # Define the input format.
    dataset = dataset.map(lambda x, y: {
        "ids": x,
        "ids_in": tf.concat([bos, x], axis=0),
        "ids_out": tf.concat([x, eos], axis=0),
        "length": tf.shape(x)[0],
        "trans_ids": y,
        "trans_length": tf.shape(y)[0]})

    # Filter out invalid examples.
    dataset = dataset.filter(lambda x: tf.greater(x["length"], 0))

    # Batch the dataset using a bucketing strategy.
    dataset = dataset.apply(tf.contrib.data.group_by_window(
        _key_func, _reduce_func, window_size=batch_size))
    return dataset.make_initializable_iterator()

  src_vocab, src_vocab_size = load_vocab(src_vocab)
  tgt_vocab, tgt_vocab_size = load_vocab(tgt_vocab)

  with tf.device("/cpu:0"):  # The input pipeline should always be placed on the CPU.
    src_iterator = load_data(src, src_trans, src_vocab, tgt_vocab)
    tgt_iterator = load_data(tgt, tgt_trans, tgt_vocab, src_vocab)

  src = src_iterator.get_next()
  tgt = tgt_iterator.get_next()

  # Step 2

  def add_noise_python(words, dropout=0.1, k=3):
    """Applies the noise model on input words.

    Args:
      words: A numpy vector of word ids.
      dropout: The probability to drop words.
      k: Maximum distance of the permutation.

    Returns:
      A noisy numpy vector of word ids.
""" def _drop_words(words, probability): """Drops words with the given probability.""" length = len(words) keep_prob = np.random.uniform(size=length) keep = np.random.uniform(size=length) > probability if np.count_nonzero(keep) == 0: ind = np.random.randint(0, length) keep[ind] = True words = np.take(words, keep.nonzero())[0] return words def _rand_perm_with_constraint(words, k): """Randomly permutes words ensuring that words are no more than k positions away from their original position.""" length = len(words) offset = np.random.uniform(size=length) * (k + 1) new_pos = np.arange(length) + offset return np.take(words, np.argsort(new_pos)) words = _drop_words(words, dropout) words = _rand_perm_with_constraint(words, k) return words def add_noise(ids, sequence_length): """Wraps add_noise_python for a batch of tensors.""" def _add_noise_single(ids, sequence_length): noisy_ids = add_noise_python(ids[:sequence_length]) noisy_sequence_length = len(noisy_ids) ids[:noisy_sequence_length] = noisy_ids ids[noisy_sequence_length:] = 0 return ids, np.int32(noisy_sequence_length) noisy_ids, noisy_sequence_length = tf.map_fn( lambda x: tf.py_func(_add_noise_single, x, [ids.dtype, tf.int32]), [ids, sequence_length], dtype=[ids.dtype, tf.int32], back_prop=False) noisy_ids.set_shape(ids.get_shape()) noisy_sequence_length.set_shape(sequence_length.get_shape()) return noisy_ids, noisy_sequence_length # Step 3 from opennmt.inputters.text_inputter import load_pretrained_embeddings def create_embeddings(vocab_size, depth=300): """Creates an embedding variable.""" return tf.get_variable("embedding", shape=[vocab_size, depth]) def load_embeddings(embedding_file, vocab_file): """Loads an embedding variable or embeddings file.""" try: embeddings = tf.get_variable("embedding") except ValueError: pretrained = load_pretrained_embeddings( embedding_file, vocab_file, num_oov_buckets=1, with_header=True, case_insensitive_embeddings=True) embeddings = tf.get_variable("embedding", shape=None, trainable=False, initializer=tf.constant( pretrained.astype(np.float32))) return embeddings with tf.variable_scope("src"): if src_emb is not None: src_emb = load_embeddings(src_emb, src_vocab) else: src_emb = create_embeddings(src_vocab_size) with tf.variable_scope("tgt"): if tgt_emb is not None: tgt_emb = load_embeddings(tgt_emb, tgt_vocab) else: tgt_emb = create_embeddings(tgt_vocab_size) # Step 4 hidden_size = 512 encoder = onmt.encoders.BidirectionalRNNEncoder(2, hidden_size) def add_noise_and_encode(ids, sequence_length, embedding, reuse=None): """Applies the noise model on ids, embeds and encodes. Args: ids: The tensor of words ids of shape [batch_size, max_time]. sequence_length: The tensor of sequence length of shape [batch_size]. embedding: The embedding variable. reuse: If True, reuse the encoder variables. Returns: A tuple (encoder output, encoder state, sequence length). 
""" noisy_ids, noisy_sequence_length = add_noise(ids, sequence_length) noisy = tf.nn.embedding_lookup(embedding, noisy_ids) with tf.variable_scope("encoder", reuse=reuse): return encoder.encode(noisy, sequence_length=noisy_sequence_length) src_encoder_auto = add_noise_and_encode(src["ids"], src["length"], src_emb, reuse=None) tgt_encoder_auto = add_noise_and_encode(tgt["ids"], tgt["length"], tgt_emb, reuse=True) src_encoder_cross = add_noise_and_encode(tgt["trans_ids"], tgt["trans_length"], src_emb, reuse=True) tgt_encoder_cross = add_noise_and_encode(src["trans_ids"], src["trans_length"], tgt_emb, reuse=True) # Step 5 decoder = onmt.decoders.AttentionalRNNDecoder( 2, hidden_size, bridge=onmt.layers.CopyBridge()) from opennmt.utils.losses import cross_entropy_sequence_loss def denoise(x, embedding, encoder_outputs, generator, reuse=None): """Denoises from the noisy encoding. Args: x: The input data from the dataset. embedding: The embedding variable. encoder_outputs: A tuple with the encoder outputs. generator: A tf.layers.Dense instance for projecting the logits. reuse: If True, reuse the decoder variables. Returns: The decoder loss. """ with tf.variable_scope("decoder", reuse=reuse): logits, _, _ = decoder.decode( tf.nn.embedding_lookup(embedding, x["ids_in"]), x["length"] + 1, initial_state=encoder_outputs[1], output_layer=generator, memory=encoder_outputs[0], memory_sequence_length=encoder_outputs[2]) cumulated_loss, _, normalizer = cross_entropy_sequence_loss( logits, x["ids_out"], x["length"] + 1) return cumulated_loss / normalizer with tf.variable_scope("src"): src_gen = tf.layers.Dense(src_vocab_size) src_gen.build([None, hidden_size]) with tf.variable_scope("tgt"): tgt_gen = tf.layers.Dense(tgt_vocab_size) tgt_gen.build([None, hidden_size]) l_auto_src = denoise(src, src_emb, src_encoder_auto, src_gen, reuse=None) l_auto_tgt = denoise(tgt, tgt_emb, tgt_encoder_auto, tgt_gen, reuse=True) l_cd_src = denoise(src, src_emb, tgt_encoder_cross, src_gen, reuse=True) l_cd_tgt = denoise(tgt, tgt_emb, src_encoder_cross, tgt_gen, reuse=True) # Step 6 def binary_cross_entropy(x, y, smoothing=0, epsilon=1e-12): """Computes the averaged binary cross entropy. bce = y*log(x) + (1-y)*log(1-x) Args: x: The predicted labels. y: The true labels. smoothing: The label smoothing coefficient. Returns: The cross entropy. """ y = tf.to_float(y) if smoothing > 0: smoothing *= 2 y = y * (1 - smoothing) + 0.5 * smoothing return -tf.reduce_mean( tf.log(x + epsilon) * y + tf.log(1.0 - x + epsilon) * (1 - y)) def discriminator(encodings, sequence_lengths, lang_ids, num_layers=3, hidden_size=1024, dropout=0.3): """Discriminates the encoder outputs against lang_ids. Args: encodings: The encoder outputs of shape [batch_size, max_time, hidden_size]. sequence_lengths: The length of each sequence of shape [batch_size]. lang_ids: The true lang id of each sequence of shape [batch_size]. num_layers: The number of layers of the discriminator. hidden_size: The hidden size of the discriminator. dropout: The dropout to apply on each discriminator layer output. Returns: A tuple with: the discriminator loss (L_d) and the adversarial loss (L_adv). 
""" x = encodings for _ in range(num_layers): x = tf.nn.dropout(x, 1.0 - dropout) x = tf.layers.dense(x, hidden_size, activation=tf.nn.leaky_relu) x = tf.nn.dropout(x, 1.0 - dropout) y = tf.layers.dense(x, 1) mask = tf.sequence_mask(sequence_lengths, maxlen=tf.shape(encodings)[1], dtype=tf.float32) mask = tf.expand_dims(mask, -1) y = tf.log_sigmoid(y) * mask y = tf.reduce_sum(y, axis=1) y = tf.exp(y) l_d = binary_cross_entropy(y, lang_ids, smoothing=0.1) l_adv = binary_cross_entropy(y, 1 - lang_ids) return l_d, l_adv from opennmt.layers.reducer import pad_in_time batch_size = tf.shape(src["length"])[0] all_encoder_outputs = [ src_encoder_auto, src_encoder_cross, tgt_encoder_auto, tgt_encoder_cross ] lang_ids = tf.concat( [tf.fill([batch_size * 2], 0), tf.fill([batch_size * 2], 1)], 0) max_time = tf.reduce_max( [tf.shape(output[0])[1] for output in all_encoder_outputs]) encodings = tf.concat([ pad_in_time(output[0], max_time - tf.shape(output[0])[1]) for output in all_encoder_outputs ], 0) sequence_lengths = tf.concat([output[2] for output in all_encoder_outputs], 0) with tf.variable_scope("discriminator"): l_d, l_adv = discriminator(encodings, sequence_lengths, lang_ids) # Step 7 lambda_auto = 1 lambda_cd = 1 lambda_adv = 1 l_auto = l_auto_src + l_auto_tgt l_cd = l_cd_src + l_cd_tgt l_final = (lambda_auto * l_auto + lambda_cd * l_cd + lambda_adv * l_adv) def build_train_op(global_step, encdec_variables, discri_variables): """Returns the training Op. When global_step % 2 == 0, it minimizes l_final and updates encdec_variables. Otherwise, it minimizes l_d and updates discri_variables. Args: global_step: The training step. encdec_variables: The list of variables of the encoder/decoder model. discri_variables: The list of variables of the discriminator. Returns: The training op. """ encdec_opt = tf.train.AdamOptimizer(learning_rate=0.0003, beta1=0.5) discri_opt = tf.train.RMSPropOptimizer(0.0005) encdec_gradients = encdec_opt.compute_gradients( l_final, var_list=encdec_variables) discri_gradients = discri_opt.compute_gradients( l_d, var_list=discri_variables) return tf.cond(tf.equal(tf.mod(global_step, 2), 0), true_fn=lambda: encdec_opt.apply_gradients( encdec_gradients, global_step=global_step), false_fn=lambda: discri_opt.apply_gradients( discri_gradients, global_step=global_step)) encdec_variables = [] discri_variables = [] for variable in tf.trainable_variables(): if variable.name.startswith("discriminator"): discri_variables.append(variable) else: encdec_variables.append(variable) global_step = tf.train.get_or_create_global_step() train_op = build_train_op(global_step, encdec_variables, discri_variables) # This example is too large for most GPUs. config = tf.ConfigProto(device_count={'GPU': 0}) i = 0 with tf.train.MonitoredTrainingSession(checkpoint_dir="model", config=config) as sess: sess.run([src_iterator.initializer, tgt_iterator.initializer]) while not sess.should_stop(): if i % 2 == 0: _, step, _l_auto, _l_cd, _l_adv, _l = sess.run( [train_op, global_step, l_auto, l_cd, l_adv, l_final]) print("{} - l_auto = {}; l_cd = {}, l_adv = {}; l = {}".format( step, _l_auto, _l_cd, _l_adv, _l)) else: _, step, _l_d = sess.run([train_op, global_step, l_d]) print("{} - l_d = {}".format(step, _l_d)) i += 1 sys.stdout.flush()