def one_iteration(self,
                  source, source_length,
                  target, target_length,
                  start_tokens,
                  optimizer,
                  gpu_index=0):
        """Build one tower: teacher-forced loss, its gradients, and beam decoding.

        Every op is created under ``/gpu:<gpu_index>``.

        Args:
            source, source_length: forwarded to ``self.encode``.
            target, target_length: ``start_tokens`` is prepended to ``target``
                along axis 1 and ``target_length`` is incremented by one
                before both are handed to ``self.decode_train``.
            start_tokens: also passed to ``self.decode_beam``.
            optimizer: forwarded to ``compute_grads``.
            gpu_index: index used in the ``/gpu:%d`` device string.

        Returns:
            ``(num_tokens, total_loss, grads, sample_ids_beam,
            final_lengths_beam)``.
        """
        with tf.device("/gpu:%d" % gpu_index):
            embedding = self.create_embedding()
            # The projection layer is optional and controlled by a flag.
            output_layer = (
                self.create_output_layer() if self.output_layer else None)

            # Encoder
            memory, memory_lengths, final_state = self.encode(
                source, source_length, embedding)

            # Decoder cell & initial state
            cell, initial_state = self.get_decoder_cell_and_initial_state(
                memory, memory_lengths, final_state)

            # Teacher forcing: decoder inputs are the targets with the start
            # token in front, hence one step longer.
            start_column = tf.expand_dims(start_tokens, axis=1)
            decoder_inputs = tf.concat([start_column, target], axis=1)
            decoder_lengths = target_length + 1
            loss_ML, num_tokens, total_loss = self.decode_train(
                decoder_inputs,
                decoder_lengths,
                embedding,
                cell, initial_state,
                output_layer=output_layer)

            # Collect the trainable variables only once; by this point all of
            # the "seq2seq" trainable variables have been created.
            if self.trainable_variables == []:
                self.trainable_variables = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope="seq2seq")

            # Tower gradients of the ML loss
            grads = compute_grads(loss_ML, optimizer, self.trainable_variables)

            # Beam-search branch (for inference), sharing cell and state
            sample_ids_beam, final_lengths_beam = self.decode_beam(
                embedding, cell, initial_state, start_tokens,
                output_layer=output_layer)

        return (num_tokens, total_loss, grads,
                sample_ids_beam, final_lengths_beam)
Example #2
0
def build_seq2seq(input_seqs, target_seqs, filtered_target_seqs,
                  input_seq_lengths, target_seq_lengths, is_training):
    """Build the multi-GPU seq2seq graph (ML loss, optional polite-RL, beam search).

    Shared pieces (masks, embedding, optimizer) are created on the CPU; one
    tower per GPU is then built on its slice of the batch, and the per-tower
    results are merged on the CPU again.

    Besides its arguments, this function reads configuration that is not
    passed in (for example ``num_gpus``, ``gpu_start_index``,
    ``batch_size_per_gpu``, ``beam_width``, ``polite_training``,
    ``credit_assignment``, ``start_tokens``, ``end_token``); those names must
    be defined in the enclosing scope.

    Args:
        input_seqs: source token ids; embedded and sliced per tower on axis 0.
        target_seqs: target token ids; used both as decoder input (without the
            last step) and as labels (without the first step).
        filtered_target_seqs: not used by this function.
        input_seq_lengths: lengths matching ``input_seqs``.
        target_seq_lengths: lengths matching ``target_seqs``.
        is_training: predicate for the ``tf.cond`` calls that choose between
            the training layout and the beam-tiled inference layout; also
            passed to ``get_keep_prob``.

    Returns:
        ``(batch_sample_ids, batch_final_lengths, batch_total_loss,
        batch_num_tokens, apply_gradients_op, credit_weights_RL_stop,
        embedding)``.
    """
    with tf.variable_scope("seq2seq"):
        with tf.device('/cpu:0'):
            reuse = False

            if get_PPL:
                keep_prob = tf.convert_to_tensor(1.0)
            else:
                keep_prob = get_keep_prob(dropout_rate, is_training)

            sequence_mask = get_sequence_mask(target_seq_lengths)

            # Loss mask = inside the sequence AND not flagged by get_mask for
            # unk_indices (presumably unknown-token positions -- confirm).
            unk_mask = get_mask(target_seqs, unk_indices)
            decoder_mask = tf.logical_and(sequence_mask,
                                          tf.logical_not(unk_mask))
            decoder_mask_float = tf.cast(decoder_mask, tf.float32)

            # Embed inputs
            with tf.variable_scope("embedding"):
                embedding = create_embedding(embedding_word2vec_politeness,
                                             embedding_word2vec_movie,
                                             shared_vocab_size_politeness,
                                             shared_vocab_size_movie,
                                             new_vocab_size_politeness,
                                             new_vocab_size_movie, "seq2seq")
                embedded_input_seqs = tf.nn.embedding_lookup(
                    embedding, input_seqs)
                embedded_target_seqs = tf.nn.embedding_lookup(
                    embedding, target_seqs)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate)

            tower_grads = []
            if credit_assignment:
                tower_grads_polite = []
            sample_ids_lst = []
            final_lengths_lst = []
            sampled_sample_ids_lst = []
            sampled_final_lengths_lst = []
            trainable_variables = []

            num_tokens_lst = []
            total_losses = []
    # `range` instead of the Python-2-only `xrange`: num_gpus is small, so the
    # list built on Python 2 is negligible, and the loop also runs on Python 3.
    for i in range(num_gpus):
        with tf.device("/gpu:%d" % (gpu_start_index + i)):
            with tf.variable_scope("seq2seq"):
                # The first tower creates the variables; from the second tower
                # on they are reused (reuse is never reset to False).
                if (i == 1):
                    reuse = True

                # This tower's slice of the batch
                start = i * batch_size_per_gpu
                end = start + batch_size_per_gpu

                input_max_seq_length = tf.reduce_max(
                    input_seq_lengths[start:end])
                target_max_seq_length = tf.reduce_max(
                    target_seq_lengths[start:end])

                with tf.variable_scope("encoder", reuse=reuse):
                    cell_fw = create_MultiRNNCell([hidden_size_encoder] *
                                                  (num_layers_encoder // 2),
                                                  keep_prob,
                                                  num_proj=None,
                                                  reuse=reuse)
                    cell_bw = create_MultiRNNCell([hidden_size_encoder] *
                                                  (num_layers_encoder // 2),
                                                  keep_prob,
                                                  num_proj=None,
                                                  reuse=reuse)
                    (encoder_outputs_original, encoder_final_state_original
                     ) = bidirecitonal_dynamic_lstm(
                         cell_fw, cell_bw, embedded_input_seqs[
                             start:end, :input_max_seq_length, :],
                         input_seq_lengths[start:end])

                    # Training uses the encoder results as they are; otherwise
                    # they are tiled beam_width times for beam search.
                    [
                        encoder_outputs, encoder_seq_lengths,
                        encoder_final_state
                    ] = tf.cond(is_training, lambda: [
                        encoder_outputs_original, input_seq_lengths[start:end],
                        encoder_final_state_original
                    ], lambda: [
                        tf.contrib.seq2seq.tile_batch(encoder_outputs_original,
                                                      beam_width),
                        tf.contrib.seq2seq.tile_batch(
                            input_seq_lengths[start:end], beam_width),
                        tile_multi_cell_state(encoder_final_state_original)
                    ])  # only works for decoder that has >1 layers!

                with tf.variable_scope("decoder", reuse=reuse):
                    decoder_cell = create_MultiRNNCell(
                        [hidden_size_decoder] * (num_layers_decoder),
                        keep_prob,
                        num_proj=vocab_size,
                        memory=encoder_outputs,
                        memory_seq_lengths=encoder_seq_lengths,
                        reuse=reuse)

                    decoder_zero_state = tf.cond(
                        is_training, lambda: decoder_cell.zero_state(
                            batch_size_per_gpu, tf.float32),
                        lambda: decoder_cell.zero_state(
                            batch_size_per_gpu * beam_width, tf.float32))

                    # The last layer's zero state is cloned with the encoder's
                    # last cell state; the lower layers take the encoder final
                    # states directly.
                    state_last = decoder_zero_state[-1].clone(
                        cell_state=encoder_final_state[-1])
                    state_previous = encoder_final_state[:-1]
                    decoder_initial_state = state_previous + (
                        state_last, )  # concat tuples

                    # training helper (for teacher forcing)
                    helper_train = tf.contrib.seq2seq.TrainingHelper(
                        embedded_target_seqs[
                            start:end, :target_max_seq_length -
                            1, :],  # get rid of end_token
                        target_seq_lengths[start:end] -
                        1)  # the length is thus decreased by 1

                    (decoder_outputs_train,
                     _) = decode(decoder_cell,
                                 helper_train,
                                 initial_state=decoder_initial_state)
                    (logits, _) = decoder_outputs_train

                    # Get trainable_variables
                    # (up to now we already have all the seq2seq trainable vars)
                    if trainable_variables == []:
                        trainable_variables = tf.get_collection(
                            tf.GraphKeys.TRAINABLE_VARIABLES, scope="seq2seq")

                    loss_ML = tf.contrib.seq2seq.sequence_loss(
                        logits,
                        target_seqs[
                            start:end,
                            1:target_max_seq_length],  # get rid of start_token
                        decoder_mask_float[start:end, 1:target_max_seq_length])
                    num_tokens = tf.reduce_sum(
                        decoder_mask_float[start:end, 1:target_max_seq_length])

                    num_tokens_lst.append(num_tokens)

                    # Un-average the loss so towers can be summed and divided
                    # by the total token count later.
                    total_loss = loss_ML * num_tokens
                    total_losses.append(total_loss)

                    if polite_training:
                        helper_sample = tf.contrib.seq2seq.SampleEmbeddingHelper(
                            embedding, start_tokens[start:end], end_token)
                        (decoder_outputs_sample,
                         final_lengths_sample) = decode(
                             decoder_cell, helper_sample,
                             decoder_initial_state)
                        (logits_sample,
                         sample_ids_sample) = decoder_outputs_sample
                        max_final_lengths_sample = tf.reduce_max(
                            final_lengths_sample)
                        sampled_sample_ids_lst.append(
                            pad_and_truncate(sample_ids_sample,
                                             final_lengths_sample))
                        sampled_final_lengths_lst.append(final_lengths_sample)

                        # Compute sampled sequence loss WITHOUT averaging (will do that later)
                        decoder_mask_sample = get_sequence_mask(
                            final_lengths_sample, dtype=tf.float32)
                        seq_losses_sample = tf.contrib.seq2seq.sequence_loss(
                            logits_sample,
                            sample_ids_sample,
                            decoder_mask_sample,
                            average_across_timesteps=False,
                            average_across_batch=False)

            if polite_training:
                with tf.variable_scope(
                        "classifier"):  # jump back to the classifier scope
                    # Filter out tokens that the classifier doesn't know
                    # (ids >= vocab_size_politeness are replaced by 0)
                    vocab_mask = tf.cast(
                        sample_ids_sample < vocab_size_politeness, tf.int32)
                    sample_ids_sample_classifier = sample_ids_sample * vocab_mask

                    # Feed sampled ids to classifier
                    (scores_RL, credit_weights_RL) = build_classifier(
                        sample_ids_sample_classifier, final_lengths_sample,
                        reuse)

                    # Stop gradients from propagating back
                    scores_RL_stop = tf.stop_gradient(scores_RL)
                    credit_weights_RL_stop = tf.stop_gradient(
                        credit_weights_RL)

                    if thresholding:
                        # Filter scores that are >= threshold and <= 1 - threshold
                        filtered_scores_RL = tf.map_fn(filter_with_threshold,
                                                       scores_RL_stop)
                    else:
                        filtered_scores_RL = scores_RL_stop

                with tf.variable_scope("seq2seq"):
                    with tf.variable_scope("decoder", reuse=reuse):
                        # Get valid mask for sampled sequence
                        decoder_mask_classifier = tf.cast(
                            tf.not_equal(sample_ids_sample, 0), tf.float32
                        )  # propagate back the whole sentence (including <end>)

                        tiled_scores = tf.tile(  # tile scores to 2D
                            tf.expand_dims(filtered_scores_RL - baseline,
                                           axis=1),
                            [1, max_final_lengths_sample])

                        if flip_polite:  # if we actually want a rude dialogue system
                            tiled_scores = -1.0 * tiled_scores

                        # Compute seq losses for polite-RL
                        seq_losses_classifier = (
                            beta * seq_losses_sample *
                            decoder_mask_classifier /
                            tf.reduce_sum(decoder_mask_classifier) *
                            tiled_scores)

                        if credit_assignment:
                            grads_polite = tf.gradients(
                                seq_losses_classifier,
                                trainable_variables,
                                grad_ys=credit_weights_RL_stop
                            )  # credit weights as initial gradients
                            grads_polite = zip_lsts(
                                [grads_polite, trainable_variables])
                            tower_grads_polite.append(grads_polite)
                        else:
                            loss_polite = tf.reduce_sum(seq_losses_classifier)
            else:
                credit_weights_RL_stop = None

            with tf.variable_scope("seq2seq"):
                with tf.variable_scope("decoder", reuse=reuse):
                    # Infer branch (beam search!)
                    beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                        decoder_cell,
                        embedding,
                        start_tokens[start:end],
                        end_token,
                        decoder_initial_state,
                        beam_width,
                        length_penalty_weight=length_penalty_weight)
                    output_beam = tf.contrib.seq2seq.dynamic_decode(
                        beam_search_decoder,
                        # impute_finished=True cannot be used with beam search
                        maximum_iterations=max_iterations,
                        swap_memory=True)
                    # Keep only the first beam (index 0) of every example
                    sample_ids = output_beam[0].predicted_ids[:, :, 0]
                    final_lengths = output_beam[2][:, 0]

                    sample_ids_lst.append(
                        pad_and_truncate(sample_ids, final_lengths))
                    final_lengths_lst.append(final_lengths)

        with tf.device("/gpu:%d" % (gpu_start_index + i)):
            with tf.variable_scope("seq2seq", reuse=reuse):
                # Compute loss
                loss = loss_ML

                # With credit assignment the polite gradients are kept in
                # their own tower list instead of being added to the loss.
                if polite_training and not credit_assignment:
                    loss = loss + loss_polite

                # Compute tower gradients
                grads = compute_grads(loss, optimizer, trainable_variables)
                tower_grads.append(grads)

    with tf.device('/cpu:0'):
        with tf.variable_scope("seq2seq"):
            # Concat sample ids and their respective lengths
            batch_sample_ids = tf.concat(sample_ids_lst, axis=0)
            batch_final_lengths = tf.concat(final_lengths_lst, axis=0)

            if polite_training:
                # NOTE(review): this tensor is neither returned nor used below.
                batch_sampled_sample_ids = tf.concat(sampled_sample_ids_lst,
                                                     axis=0)

            batch_total_loss = tf.add_n(total_losses)
            batch_num_tokens = tf.add_n(num_tokens_lst)

            # Thus, the effective batch size is actually batch_size_per_gpu
            if polite_training and credit_assignment:
                apply_gradients_op = apply_multiple_grads(
                    optimizer, [tower_grads, tower_grads_polite])
            else:
                apply_gradients_op = apply_grads(optimizer, tower_grads)

    # NOTE(review): credit_weights_RL_stop is the value left over from the
    # last tower of the loop, not a concatenation over all towers.
    return (batch_sample_ids, batch_final_lengths, batch_total_loss,
            batch_num_tokens, apply_gradients_op, credit_weights_RL_stop,
            embedding)
    def __init__(
        self,
        batch_size,
        vocab_size,
        embedding_size,
        hidden_size_encoder,
        hidden_size_decoder,
        max_iterations,
        start_token,
        end_token,
        unk_indices,
        num_layers_encoder=1,
        num_layers_decoder=1,
        attention_size=512,
        attention_layer_size=256,
        beam_width=10,
        length_penalty_weight=1.0,
        gpu_start_index=0,
        num_gpus=1,  # set to 1 when testing
        learning_rate=0.001,
        clipping_threshold=5.0,
        feed_both_examples=False,
        use_max_margin=False,
        max_margin_weight=1.0,
        margin=0.1,
        reward_clipping_threshold=1.0,
        backward=False,  # whether we are training a backward model
        feed_tensors=None,  # when provided, placeholders are not used
        use_MMI_reward=False,
        MMI_weight=0.00,
        use_reranking_reward=False,
        reranking_weight=0.00,
        num_samples_reranking=2,
        use_gleu_reward=False,
        gleu_weight=0.00,
        softmax_temperature=1.0,
        beam_search=True
    ):
        """Store the hyper-parameters and build the training graph.

        The constructor records its arguments on ``self``, creates the
        placeholders and the global step, calls ``self.one_iteration`` once,
        combines the returned losses into a single weighted loss and creates
        ``self.apply_gradients_op`` with an Adam optimizer.

        Args:
            batch_size: must be divisible by ``num_gpus`` and, when
                ``feed_both_examples`` is set, by 2.
            hidden_size_encoder, hidden_size_decoder: the decoder size must be
                exactly twice the encoder size.
            gpu_start_index: index of the GPU the graph is placed on.
            learning_rate: learning rate of the Adam optimizer.
            clipping_threshold: forwarded to ``apply_grads``.
            feed_both_examples: the batch then holds two halves ("norm" and
                "adv"); only the first half counts as norm inputs.
            use_max_margin, max_margin_weight, margin: max-margin loss
                settings; ``use_max_margin`` requires ``feed_both_examples``.
            reward_clipping_threshold: stored on ``self``; not otherwise used
                by the constructor.
            backward: if true, variables live under ``"seq2seq_backward/"``
                instead of ``"seq2seq/"``.
            feed_tensors: per the original note, when provided placeholders
                are not used. ``None`` (the default) becomes a new empty list.
            use_MMI_reward / MMI_weight, use_reranking_reward /
                reranking_weight, use_gleu_reward / gleu_weight: each weight
                is forced to 0.0 when its flag is off; the ML loss weight is
                one minus the sum of the three.
            Remaining arguments are stored on ``self`` unchanged.

        Raises:
            AssertionError: if one of the size/flag constraints above fails.
        """
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size_encoder = hidden_size_encoder
        self.hidden_size_decoder = hidden_size_decoder
        assert self.hidden_size_encoder * 2 == self.hidden_size_decoder
        self.max_iterations = max_iterations
        self.start_token = start_token
        self.end_token = end_token
        self.unk_indices = unk_indices
        self.num_layers_encoder = num_layers_encoder
        self.num_layers_decoder = num_layers_decoder
        self.attention_size = attention_size
        self.attention_layer_size = attention_layer_size
        self.beam_width = beam_width
        self.length_penalty_weight = length_penalty_weight
        self.gpu_start_index = gpu_start_index
        self.num_gpus = num_gpus
        self.learning_rate = learning_rate
        self.clipping_threshold = clipping_threshold
        # Previously accepted by the signature but silently dropped; keep it
        # so the value the caller passed is available on the instance.
        self.reward_clipping_threshold = reward_clipping_threshold
        self.feed_both_examples = feed_both_examples
        if self.feed_both_examples:
            assert self.batch_size % 2 == 0
            self.half_batch_size = self.batch_size // 2
        self.use_max_margin = use_max_margin
        if self.use_max_margin:
            assert self.feed_both_examples  # if Should-Change, then feed_both_examples must be True
        self.max_margin_weight = max_margin_weight
        self.margin = margin
        self.beam_search = beam_search

        assert self.batch_size % self.num_gpus == 0
        self.batch_size_per_gpu = self.batch_size // self.num_gpus
        # A fresh list per instance: a literal [] default would be one shared
        # object across every instance constructed without this argument.
        self.feed_tensors = [] if feed_tensors is None else feed_tensors

        self.use_MMI_reward = use_MMI_reward
        self.MMI_weight = MMI_weight
        self.use_reranking_reward = use_reranking_reward
        self.reranking_weight = reranking_weight
        # NOTE(review): an orphaned signature comment read "how many samples
        # to use for baseline (RL training)"; presumably it described this
        # parameter -- confirm.
        self.num_samples_reranking = num_samples_reranking
        self.use_gleu_reward = use_gleu_reward
        self.gleu_weight = gleu_weight
        self.RL_training = self.use_MMI_reward or self.use_reranking_reward or self.use_gleu_reward

        self.softmax_temperature = softmax_temperature

        if self.feed_both_examples:
            print("Feeding both examples...")
            # Evenness of batch_size was already asserted above.
            self.norm_batch_size = self.batch_size // 2
        else:
            self.norm_batch_size = self.batch_size  # when feeding both examples, only first half are norm inputs

        if self.use_max_margin:
            print("Max margin weight: {}, margin: {}".format(
                self.max_margin_weight, self.margin))
        # We are only performing RL training on the norm_batch_size part, which may or may not be the whole batch

        # A reward that is switched off contributes weight 0, so that
        # ML_weight below only subtracts the active reward weights.
        if self.use_MMI_reward:
            print("MMI weight:", self.MMI_weight)
        else:
            self.MMI_weight = 0.0

        if self.use_reranking_reward:
            print("Neural Reranking reward weight:", self.reranking_weight)
        else:
            self.reranking_weight = 0.0

        if self.use_gleu_reward:
            print("GLEU reward weight:", self.gleu_weight)
        else:
            self.gleu_weight = 0.0

        self.ML_weight = 1.0 - (self.MMI_weight + self.reranking_weight +
                                self.gleu_weight)

        self.trainable_variables = []
        if self.use_MMI_reward:
            self.trainable_variables_backward = []

        # For a backward model, the namespace will be "seq2seq_backward"
        extra_str = "_backward" if backward else ""
        self.main_scope = "seq2seq" + extra_str + "/"
        with tf.device("/gpu:%d" % self.gpu_start_index):
            self.create_placeholders()

            # Tile only if we are not training and using beam search
            self.tile = tf.logical_and(tf.logical_not(self.is_training),
                                       self.beam_search)

            self.global_step = tf.get_variable(self.main_scope + "global_step",
                                               initializer=0,
                                               dtype=tf.int32,
                                               trainable=False)

            # Note: if feeding both examples, the first dimension of total_loss is twice as that of
            # loss_RL's
            (self.total_loss, max_margin_loss, num_tokens, loss_MMI, loss_gleu,
             loss_reranking, num_tokens_RL, self.batch_sample_ids_beam,
             self.batch_final_lengths_beam) = self.one_iteration(
                 self.source, self.source_length, self.target,
                 self.target_length, self.start_tokens)

            # This part is for monitoring PPL, not for training.
            if self.feed_both_examples:
                self.batch_num_tokens = num_tokens / 2.0
                # We monitor ML losses for both norm- and adv-data
                self.batch_total_loss = (tf.reduce_sum(
                    self.norm(self.total_loss)),
                                         tf.reduce_sum(
                                             self.adv(self.total_loss)))
            else:
                self.batch_num_tokens = num_tokens
                self.batch_total_loss = tf.reduce_sum(self.total_loss)

            loss_terms = []

            # when using max_margin, it must be a Should-Change strategy
            if self.use_max_margin:
                loss_ML = self.norm(
                    self.total_loss
                )  # in this case we don't want to train on (adv-S, T) pairs
                num_tokens_ML = num_tokens / 2.0
            else:
                # The whole batch (norm and, if fed, adv examples) is trained
                # on with the ML loss.
                loss_ML = self.total_loss
                num_tokens_ML = num_tokens

            loss_terms.append(self.ML_weight * tf.reduce_sum(loss_ML) /
                              num_tokens_ML)

            if self.use_max_margin:
                # need to scale max_margin_weight by ML_weight to make the training stable
                # (instead of scaling with loss_ML + loss_RL)
                loss_terms.append(
                    self.max_margin_weight * self.ML_weight *
                    tf.reduce_mean(max_margin_loss))

            if self.RL_training and not self.use_max_margin:
                num_tokens_RL = num_tokens_RL / 2.0  # effectively double the RL_loss

            if self.use_MMI_reward:
                loss_terms.append(self.MMI_weight * tf.reduce_sum(loss_MMI) /
                                  num_tokens_RL)

            if self.use_reranking_reward:
                loss_terms.append(self.reranking_weight *
                                  tf.reduce_sum(loss_reranking) /
                                  num_tokens_RL)

            if self.use_gleu_reward:
                loss_terms.append(self.gleu_weight * tf.reduce_sum(loss_gleu) /
                                  num_tokens_RL)

            assert loss_terms != []
            loss = tf.add_n(loss_terms)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            grads = compute_grads(loss, optimizer, self.trainable_variables)
            self.apply_gradients_op = apply_grads(
                optimizer, [grads],
                clipping_threshold=self.clipping_threshold,
                global_step=self.global_step)
Example #4
0
    def __init__(self,
                 batch_size,
                 vocab_size,
                 embedding_size,
                 hidden_size_encoder,
                 hidden_size_context,
                 hidden_size_decoder,
                 max_iterations,
                 max_dialogue_length,
                 start_token,
                 end_token,
                 unk_indices,
                 num_layers_encoder=1,
                 num_layers_context=1,
                 num_layers_decoder=1,
                 attention_size=512,
                 attention_layer_size=256,
                 beam_search=False,
                 beam_width=10,
                 length_penalty_weight=1.0,
                 gpu_start_index=0,
                 learning_rate=0.001,
                 clipping_threshold=5.0,
                 feed_both_examples=False,
                 use_max_margin=False,
                 max_margin_weight=1.0,
                 margin=0.5):
        # Records the hyper-parameters on the instance, then builds the graph
        # under the "seq2seq" variable scope: placeholders, a non-trainable
        # accumulator of context inputs, one call to self.one_iteration on
        # the chosen GPU, the combined loss (weighted KL + cross entropy,
        # plus max-margin when enabled) and the gradient-application op.
        # Raises AssertionError when max_dialogue_length <= 0, when
        # feed_both_examples is set with an odd batch_size, or when
        # use_max_margin is set without feed_both_examples.
        self.batch_size = batch_size
        self.half_batch_size = batch_size // 2
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size_encoder = hidden_size_encoder
        self.hidden_size_context = hidden_size_context
        self.hidden_size_decoder = hidden_size_decoder
        self.dim_z = hidden_size_context  # this decision is arbitrary

        # Factory for non-trainable, zero-initialised context state variables
        zero_context_state = tf.zeros([batch_size, hidden_size_context])
        self.create_context_initial_state_var = functools.partial(
            tf.get_variable,
            initializer=zero_context_state,
            dtype=tf.float32,
            trainable=False)

        # Dense layer factory with dim_z output units
        self.dense = functools.partial(
            tf.layers.dense, units=self.dim_z, use_bias=True)

        self.max_iterations = max_iterations
        self.max_dialogue_length = max_dialogue_length
        assert self.max_dialogue_length > 0

        self.start_token = start_token
        self.end_token = end_token
        self.unk_indices = unk_indices
        self.num_layers_encoder = num_layers_encoder
        self.num_layers_context = num_layers_context
        self.num_layers_decoder = num_layers_decoder
        self.attention_size = attention_size
        self.attention_layer_size = attention_layer_size
        self.beam_search = beam_search
        self.beam_width = beam_width
        self.length_penalty_weight = length_penalty_weight
        self.clipping_threshold = clipping_threshold
        self.feed_both_examples = feed_both_examples
        if feed_both_examples:
            # The batch is split into two equal halves
            assert batch_size % 2 == 0
        self.use_max_margin = use_max_margin
        if use_max_margin:
            # Max-margin needs both halves of the batch
            assert feed_both_examples
        self.max_margin_weight = max_margin_weight
        self.margin = margin

        self.prior_str = "prior"
        self.posterior_str = "posterior"
        """
        context_input_acc put context inputs in reverse order.
            • When adding inputs, just concat from left
            • When using, we do not need to reverse it back because we are using bidirectional-lstm
                Note: If we don't use bidirectional-lstm, then we will need tf.reverse_sequence()
        """
        self.trainable_variables = []

        with tf.variable_scope("seq2seq"):
            self.create_placeholders()

            # Tile only if we are not training and using beam search
            not_training = tf.logical_not(self.is_training)
            self.tile = tf.logical_and(not_training, self.beam_search)

            # Width of one context input vector
            context_input_size = hidden_size_encoder * 2

            context_input_acc = tf.get_variable(
                "context_input_acc",
                initializer=tf.zeros([batch_size, 2, context_input_size],
                                     dtype=tf.float32),
                trainable=False)
            self.global_step = tf.get_variable(
                "global_step", initializer=0, dtype=tf.int32, trainable=False)

            # max_num_turns better not be less than 2, since we may just lose
            # a whole dimension (i.e., axis=1)?
            largest_start_turn = tf.reduce_max(self.start_turn_index)
            max_num_turns = tf.maximum(largest_start_turn, 2)

            # Per-example flag (start_turn_index > 0), expanded by two dims
            # and tiled to the shape of the accumulated context inputs.
            carries_context = tf.greater(self.start_turn_index, 0)
            carries_context = tf.reshape(carries_context, [batch_size, 1, 1])
            context_input_mask = tf.tile(
                carries_context, [1, max_num_turns, context_input_size])

            # This multiplication resets context inputs whose
            # start_turn_index == 0
            accumulated = context_input_acc[:, :max_num_turns, :]
            previous_context_input = accumulated * tf.cast(
                context_input_mask, tf.float32)

            adam = tf.train.AdamOptimizer(learning_rate)

            with tf.device("/gpu:%d" % gpu_start_index):
                iteration_outputs = self.one_iteration(
                    self.dialogue,
                    self.turn_length,
                    previous_context_input,
                    self.start_turn_index,
                    self.start_tokens)
                (kl_loss, total_loss, num_tokens, max_margin_loss,
                 context_input, sample_ids, final_lengths) = iteration_outputs

                def ramped_kl_weight():
                    # Grows linearly with global_step, capped at 1.0
                    step = tf.cast(self.global_step, tf.float32)
                    return tf.minimum(1.0 / 75000.0 * step, 1.0)

                def full_kl_weight():
                    return 1.0

                kl_loss_weight = tf.cond(
                    self.tile, ramped_kl_weight, full_kl_weight)

                if self.use_max_margin:
                    # Only the first half of the batch contributes to the KL term
                    kl_loss = kl_loss[:self.half_batch_size]
                mean_kl_loss = tf.reduce_mean(kl_loss)
                weighted_kl_loss = kl_loss_weight * mean_kl_loss

                if self.use_max_margin:
                    num_tokens = num_tokens / 2.0
                    # Note: here total_loss is a 1-D vector (already summed
                    # with axis=1)
                    first_half_losses = total_loss[:self.half_batch_size]
                    cross_ent_loss = tf.reduce_sum(first_half_losses)
                    second_half_losses = total_loss[self.half_batch_size:]
                    adv_cross_ent_loss = tf.reduce_sum(second_half_losses)
                    self.batch_total_loss = (cross_ent_loss,
                                             adv_cross_ent_loss)

                    per_token_loss = cross_ent_loss / num_tokens
                    kl_and_ml = weighted_kl_loss + per_token_loss
                    # max_margin_loss shape=[self.half_batch_size]
                    margin_term = (self.max_margin_weight *
                                   tf.reduce_mean(max_margin_loss))
                    loss = kl_and_ml + margin_term
                else:
                    cross_ent_loss = tf.reduce_sum(total_loss)
                    self.batch_total_loss = cross_ent_loss
                    per_token_loss = cross_ent_loss / num_tokens
                    loss = weighted_kl_loss + per_token_loss

                self.batch_num_tokens = num_tokens

                gradients = compute_grads(loss, adam,
                                          self.trainable_variables)

                # First context input will be repeated in the next batch, so
                # we ignore it.
                later_context_inputs = context_input[:, 1:, :]
                assign_context_input_op = tf.assign(
                    context_input_acc,
                    later_context_inputs,
                    validate_shape=False)  # shape will be different on axis=1

                # Everything created below runs only after context_input_acc
                # has been updated.
                with tf.control_dependencies([assign_context_input_op]):
                    self.apply_gradients_op = apply_grads(
                        adam, [gradients],
                        clipping_threshold=self.clipping_threshold,
                        global_step=self.global_step)

                    # Just for control dependencies
                    self.batch_sample_ids_beam = tf.identity(sample_ids)
                    self.batch_final_lengths_beam = tf.identity(final_lengths)
Example #5
0
    def __init__(self,
                 batch_size,
                 vocab_size,
                 embedding_size,
                 hidden_size_encoder, hidden_size_context, hidden_size_decoder,
                 dim_z,
                 max_iterations, max_dialogue_length,
                 start_token, end_token, unk_indices,
                 num_layers_encoder=1, num_layers_context=1, num_layers_decoder=1,
                 attention_size=512, attention_layer_size=256,
                 beam_width=10, length_penalty_weight=1.0,
                 gpu_start_index=0,
                 num_gpus=1,  # set to 1 when testing
                 learning_rate=0.001,
                 clipping_threshold=5.0,
                 truncated=True):
        """Store the hyper-parameters and build the multi-GPU graph.

        The batch-level inputs are split along axis 0 into `num_gpus` equal
        shards. `self.one_iteration` is called once per shard under the
        device "/gpu:<gpu_start_index + i>", and the per-tower gradients are
        handed to `apply_grads`. All ops are created inside the "seq2seq"
        variable scope.

        Args:
            batch_size: Total batch size across all GPUs; must be divisible
                by `num_gpus`.
            vocab_size: Stored on the instance.
            embedding_size: Stored on the instance.
            hidden_size_encoder: Stored; `context_input_acc` has last
                dimension `hidden_size_encoder * 2`.
            hidden_size_context: Stored; `hidden_size_context * 2` must equal
                `hidden_size_decoder`.
            hidden_size_decoder: Stored on the instance.
            dim_z: Number of units of the `self.dense` layer factory.
            max_iterations: Stored on the instance.
            max_dialogue_length: Stored; must be positive.
            start_token: Repeated `batch_size // num_gpus` times to form
                `self.start_tokens`.
            end_token: Stored on the instance.
            unk_indices: Stored on the instance.
            num_layers_encoder: Stored on the instance.
            num_layers_context: Stored on the instance.
            num_layers_decoder: Stored on the instance.
            attention_size: Stored on the instance.
            attention_layer_size: Stored on the instance.
            beam_width: Stored on the instance.
            length_penalty_weight: Stored on the instance.
            gpu_start_index: Index of the GPU device used for the first tower.
            num_gpus: Number of towers / GPU devices to build.
            learning_rate: Learning rate given to `tf.train.AdamOptimizer`.
            clipping_threshold: Stored and forwarded to `apply_grads`.
            truncated: Stored on the instance.

        Raises:
            ValueError: If `batch_size` is not a multiple of `num_gpus`.
            AssertionError: If the hidden-size or dialogue-length constraints
                above are violated.

        Attributes created for callers: `batch_sample_ids_beam`,
        `batch_final_lengths_beam`, `batch_total_loss`, `batch_num_tokens`
        and `apply_gradients_op`.
        """
        # Fail early and clearly: every input is split into `num_gpus` equal
        # shards further down, and `batch_size // num_gpus` would otherwise
        # silently drop the remainder.
        if batch_size % num_gpus != 0:
            raise ValueError(
                "batch_size (%d) must be evenly divisible by num_gpus (%d)"
                % (batch_size, num_gpus))

        self.batch_size = batch_size
        self.batch_size_per_gpu = batch_size // num_gpus
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size_encoder = hidden_size_encoder
        self.hidden_size_context = hidden_size_context
        self.hidden_size_decoder = hidden_size_decoder
        assert self.hidden_size_context * 2 == self.hidden_size_decoder, (
            "hidden_size_decoder must be twice hidden_size_context")

        self.dim_z = dim_z
        # Factory for dense layers with `dim_z` output units.
        self.dense = functools.partial(
            tf.layers.dense, units=self.dim_z, use_bias=True)

        self.max_iterations = max_iterations
        self.max_dialogue_length = max_dialogue_length
        assert self.max_dialogue_length > 0, (
            "max_dialogue_length must be positive")

        # One start token per example of a single tower's shard.
        self.start_tokens = [start_token] * self.batch_size_per_gpu
        self.end_token = end_token
        self.unk_indices = unk_indices
        self.num_layers_encoder = num_layers_encoder
        self.num_layers_context = num_layers_context
        self.num_layers_decoder = num_layers_decoder
        self.attention_size = attention_size
        self.attention_layer_size = attention_layer_size
        self.beam_width = beam_width
        self.length_penalty_weight = length_penalty_weight
        self.num_gpus = num_gpus
        self.clipping_threshold = clipping_threshold
        self.truncated = truncated

        # Starts empty and is passed to compute_grads() below.
        # NOTE(review): presumably filled in by one_iteration() once the
        # model variables exist -- confirm.
        self.trainable_variables = []

        with tf.variable_scope("seq2seq"):
            with tf.device('/cpu:0'):
                self.create_placeholders()

                # context_input_acc puts context inputs in reverse order.
                #   - When adding inputs, just concat from the left.
                #   - When using it, we do not need to reverse it back
                #     because we are using a bidirectional LSTM.
                #     Note: without a bidirectional LSTM we would need
                #     tf.reverse_sequence().
                context_input_acc = tf.get_variable(
                    "context_input_acc",
                    initializer=tf.zeros(
                        [self.batch_size, 2, self.hidden_size_encoder * 2],
                        dtype=tf.float32),
                    trainable=False)

                # max_num_turns better not be less than 2, since we may just
                # lose a whole dimension (i.e., axis=1)?
                max_num_turns = tf.maximum(
                    tf.reduce_max(self.start_turn_index), 2)
                context_input_mask = tf.tile(
                    tf.reshape(
                        tf.greater(self.start_turn_index, 0),
                        [self.batch_size, 1, 1]),  # expand two dims
                    [1, max_num_turns, self.hidden_size_encoder * 2])
                # This multiplication resets the context input of examples
                # that have start_turn_index == 0.
                previous_context_input = (
                    context_input_acc[:, :max_num_turns, :]
                    * tf.cast(context_input_mask, tf.float32))

                # Shard every batch-level tensor across the towers
                # (divisibility was validated at the top of this method).
                [dialogue_lst, turn_length_lst,
                 previous_context_input_lst,
                 start_turn_index_lst] = [
                    tf.split(tensor, self.num_gpus, axis=0)
                    for tensor
                    in [self.dialogue, self.turn_length,
                        # axis=1 cannot be less than 2, otherwise tf.map_fn
                        # will give an error.
                        previous_context_input,
                        self.start_turn_index]]

                optimizer = tf.train.AdamOptimizer(learning_rate)

                context_input_lst = []
                sample_ids_beam_lst = []
                final_lengths_beam_lst = []
                num_tokens_lst = []
                total_losses = []
                tower_grads = []

            # Build one tower per GPU. `range` (not the Python-2-only
            # `xrange`) keeps this loop working on both Python 2 and 3.
            for i in range(num_gpus):
                with tf.device("/gpu:%d" % (gpu_start_index + i)):
                    (total_loss, num_tokens, context_input,
                     sample_ids_beam, final_lengths_beam) = self.one_iteration(
                        dialogue_lst[i], turn_length_lst[i],
                        previous_context_input_lst[i], start_turn_index_lst[i],
                        optimizer)

                    # first turn will be repeated in the next batch, so we
                    # skip it
                    context_input_lst.append(context_input[:, 1:, :])

                    sample_ids_beam_lst.append(sample_ids_beam)
                    final_lengths_beam_lst.append(final_lengths_beam)

                    # Each tower's gradients are taken w.r.t. its own loss
                    # divided by its own token count.
                    grads = compute_grads(
                        total_loss / num_tokens, optimizer,
                        self.trainable_variables)
                    tower_grads.append(grads)

                    total_losses.append(total_loss)
                    num_tokens_lst.append(num_tokens)

            with tf.device('/cpu:0'):
                context_input_concat = tf.concat(context_input_lst, axis=0)
                assign_context_input_op = tf.assign(
                    context_input_acc, context_input_concat,
                    validate_shape=False)  # shape will be different on axis=1
                # Make sure context_input_acc is updated whenever any of the
                # ops created below is run.
                with tf.control_dependencies([assign_context_input_op]):
                    # Concat sample ids and their respective lengths
                    self.batch_sample_ids_beam = tf.concat(
                        sample_ids_beam_lst, axis=0)
                    self.batch_final_lengths_beam = tf.concat(
                        final_lengths_beam_lst, axis=0)

                    self.batch_total_loss = tf.add_n(total_losses)
                    self.batch_num_tokens = tf.add_n(num_tokens_lst)

                    self.apply_gradients_op = apply_grads(
                        optimizer, tower_grads,
                        clipping_threshold=self.clipping_threshold)