Beispiel #1
0
def attention_decoder_with_embedding(decoder_inputs, initial_state, attention_states,
                                     cell, embedding, num_heads=1,
                                     output_size=None, dtype=dtypes.float32, scope=None,
                                     initial_state_attention=False):
    """Embed every decoder input and run the result through ``attention_decoder``.

    No output projection is applied because a sampled softmax is NOT used.

    Parameters
    ----------
    decoder_inputs
        Iterable of per-step inputs; each one is looked up in ``embedding``.
    initial_state
        Forwarded unchanged to ``attention_decoder``.
    attention_states
        Forwarded unchanged to ``attention_decoder``.
    cell
        Forwarded to ``attention_decoder``; ``cell.output_size`` is used when
        ``output_size`` is None.
    embedding
        Embedding parameters created by the caller and passed in.
    num_heads
        Forwarded unchanged to ``attention_decoder``.
    output_size
        Forwarded to ``attention_decoder``; defaults to ``cell.output_size``.
    dtype
        NOTE(review): accepted but not forwarded to ``attention_decoder``.
    scope
        Variable scope; "attention_decoder_with_embedding" when falsy.
    initial_state_attention
        Forwarded unchanged to ``attention_decoder``.

    Returns
    -------
    The value returned by ``attention_decoder`` for the embedded inputs.
    """
    decoder_output_size = cell.output_size if output_size is None else output_size

    with vs.variable_scope(scope or "attention_decoder_with_embedding"):
        embedded_inputs = []
        for step_input in decoder_inputs:
            embedded_inputs.append(
                embedding_ops.embedding_lookup(embedding, step_input))

        # The decoder is always fed the provided inputs (no loop function).
        return attention_decoder(
            embedded_inputs, initial_state, attention_states, cell,
            output_size=decoder_output_size, num_heads=num_heads,
            loop_function=None,
            initial_state_attention=initial_state_attention)
Beispiel #2
0
def attention_decoder_with_embedding(decoder_inputs,
                                     initial_state,
                                     attention_states,
                                     cell,
                                     embedding,
                                     num_heads=1,
                                     output_size=None,
                                     dtype=dtypes.float32,
                                     scope=None,
                                     initial_state_attention=False):
    """Look up embeddings for the decoder inputs, then call ``attention_decoder``.

    No output projection is applied because a sampled softmax is NOT used.

    Parameters
    ----------
    decoder_inputs
        Iterable of per-step inputs; each one is looked up in ``embedding``.
    initial_state
        Passed through to ``attention_decoder``.
    attention_states
        Passed through to ``attention_decoder``.
    cell
        Passed through to ``attention_decoder``; supplies the default
        ``output_size`` via ``cell.output_size``.
    embedding
        Embedding parameters owned by the caller.
    num_heads
        Passed through to ``attention_decoder``.
    output_size
        Passed through to ``attention_decoder``; ``cell.output_size`` if None.
    dtype
        NOTE(review): accepted but not passed on to ``attention_decoder``.
    scope
        Variable scope; "attention_decoder_with_embedding" when falsy.
    initial_state_attention
        Passed through to ``attention_decoder``.

    Returns
    -------
    The value returned by ``attention_decoder`` for the embedded inputs.
    """
    resolved_output_size = output_size
    if resolved_output_size is None:
        resolved_output_size = cell.output_size

    with vs.variable_scope(scope or "attention_decoder_with_embedding"):
        embedded_steps = []
        for token_ids in decoder_inputs:
            embedded_steps.append(
                embedding_ops.embedding_lookup(embedding, token_ids))

        decoder_kwargs = dict(
            output_size=resolved_output_size,
            num_heads=num_heads,
            loop_function=None,  # always feed the provided inputs
            initial_state_attention=initial_state_attention)
        return attention_decoder(embedded_steps, initial_state,
                                 attention_states, cell, **decoder_kwargs)
Beispiel #3
0
def simple_attentional_rnn(rnn_input,
                           attention_state_list,
                           initial_state=None):
    """Runs an attention decoder over rnn_input with a dropout-wrapped cell.

    Args:
      rnn_input: List of tensors of sizes [-1, sentembed_size]
      attention_state_list: List of tensors of sizes [-1, sentembed_size]
      initial_state: Forwarded as given to seq2seq.attention_decoder. No zero
        state is substituted here when it is None.
    Returns:
      outputs, state as returned by seq2seq.attention_decoder
    """
    # Collapse the list of attention states into one tensor.
    num_attention_states = len(attention_state_list)
    attention_states = reshape_list2tensor(
        attention_state_list, num_attention_states, FLAGS.sentembed_size)

    base_cell = get_lstm_cell()

    # A keep probability of 1.0 leaves the corresponding side untouched.
    if FLAGS.use_dropout:
        input_keep = FLAGS.dropout
    else:
        input_keep = 1.0
    if FLAGS.use_dropout_outatt:
        output_keep = FLAGS.dropout
    else:
        output_keep = 1.0
    dropout_cell = tf.nn.rnn_cell.DropoutWrapper(
        base_cell, input_keep_prob=input_keep, output_keep_prob=output_keep)

    if FLAGS.use_fp16:
        decoder_dtype = tf.float16
    else:
        decoder_dtype = tf.float32

    decoder_options = dict(
        output_size=None,
        num_heads=1,
        loop_function=None,
        dtype=decoder_dtype,
        scope=None,
        initial_state_attention=False)
    rnn_outputs, rnn_state = seq2seq.attention_decoder(
        rnn_input, initial_state, attention_states, dropout_cell,
        **decoder_options)
    return rnn_outputs, rnn_state
Beispiel #4
0
def attention_decoder( batch_input_shape, cells, code, annotation, keep_prob, **kwargs ):
    """Build an attention decoder that reconstructs a sequence from ``code``.

    Args:
        batch_input_shape: Triple unpacked as (batch_size, timestep, feature).
        cells: List holding exactly one RNN cell (asserted).
        code: Tensor used, after dropout, as the first decoder input; its
            second static dimension sets the width of the other inputs.
        annotation: List of tensors that is packed and transposed with
            perm [1, 0, 2] to form the attention states.
        keep_prob: Keep probability for every dropout applied here.
        **kwargs: Accepted and ignored.

    Returns:
        ReLU of the per-step projections of the decoder outputs, packed and
        transposed with perm [1, 0, 2].
    """
    batch_size, timestep, feature = batch_input_shape

    assert len(cells) == 1, "One cell needed!"
    decoder_cell = cells[0]
    hidden_dim = decoder_cell.output_size

    first_input = tf.nn.dropout(code, keep_prob)
    code_dim = int(first_input.get_shape()[1])

    # Placeholders for the remaining steps. NOTE(review): nothing in this
    # function feeds them; presumably the loop function below replaces them
    # inside seq2seq.attention_decoder - confirm.
    later_inputs = []
    for _ in range(timestep - 1):
        later_inputs.append(
            tf.placeholder(tf.float32, shape=[batch_size, code_dim]))

    decoder_inputs = [first_input]
    for placeholder in later_inputs:
        decoder_inputs.append(tf.nn.dropout(placeholder, keep_prob))

    def feed_previous_output(prev, i):
        # The previous decoder output becomes the next input.
        return prev

    stacked_annotation = tf.pack(annotation)
    attention_states = tf.transpose(stacked_annotation, perm=[1, 0, 2])

    zero_state = decoder_cell.zero_state(batch_size, tf.float32)
    decoder_outputs, _ = seq2seq.attention_decoder(
        decoder_inputs, zero_state, attention_states, decoder_cell,
        loop_function=feed_previous_output)

    W_out = tf.get_variable(
        "W_out",
        shape=[hidden_dim, feature],
        initializer=tf.contrib.layers.xavier_initializer())

    # NOTE(review): this bias variable is created but never added to the
    # projection below; it is kept so the set of graph variables is unchanged.
    b_out = tf.Variable(tf.zeros([feature]))

    projected_steps = []
    for step_output in decoder_outputs:
        projected_steps.append(
            tf.matmul(tf.nn.dropout(step_output, keep_prob), W_out))

    stacked_projection = tf.pack(projected_steps)
    recX = tf.nn.relu(tf.transpose(stacked_projection, perm=[1, 0, 2]))
    return recX
Beispiel #5
0
    def __init__(self, args, infer=False):
        """Build the seq2seq language-model graph.

        Construction happens in three stages:
        1. choose the recurrent cell (type, number of units, number of
           stacked layers);
        2. turn the input batch into the list-of-tensors format expected by
           the seq2seq decoders and obtain the outputs and the last state;
        3. project the outputs to vocabulary logits, apply a softmax, and set
           up the loss and the optimizer.

        Three cell types are selectable ('rnn', 'gru', 'lstm'); each is
        stacked ``args.num_layers`` times with MultiRNNCell.

        :param args: configuration object. NOTE: when ``infer`` is true its
            ``batch_size`` and ``seq_length`` are overwritten with 1.
        :param infer: when true, the decoder feeds its own previous
            prediction back in through the loop function.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Checked in this order; the first matching name wins.
        supported_cells = (('rnn', 'BasicRNNCell'),
                           ('gru', 'GRUCell'),
                           ('lstm', 'BasicLSTMCell'))
        for cell_kind, class_name in supported_cells:
            if args.rnncell == cell_kind:
                cell_fn = getattr(rnn_cell, class_name)
                break
        else:
            raise Exception("rnncell type not supported: {}".format(
                args.rnncell))

        single_layer = cell_fn(args.rnn_size)
        self.cell = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)

        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = build_weight(
                [args.rnn_size, args.vocab_size], name='soft_w')
            softmax_b = build_weight([args.vocab_size], name='soft_b')
            # The embedding matrix is built here and looked up by hand.
            word_embedding = build_weight(
                [args.vocab_size, args.embedding_size], name='word_embedding')
            embedded_batch = tf.nn.embedding_lookup(
                word_embedding, self.input_data)
            # One slice per time step, each with the length-1 time axis
            # removed, so the decoders receive a list of seq_length tensors.
            step_slices = tf.split(1, args.seq_length, embedded_batch)
            inputs_list = [tf.squeeze(step, [1]) for step in step_slices]

        def loop(prev, _):
            # Feed the embedding of the most likely word back in as the next
            # input.
            step_logits = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(step_logits, 1))
            return tf.nn.embedding_lookup(word_embedding, prev_symbol)

        # The loop function is only active when sampling.
        loop_function = loop if infer else None

        # Both decoders take the per-step input list, the initial cell state
        # and the cell, and return the per-step outputs plus the last state.
        if args.attention:
            self.attn_length = 5
            self.attn_size = 32
            self.attention_states = build_weight(
                [args.batch_size, self.attn_length, self.attn_size])
            outputs, last_state = seq2seq.attention_decoder(
                inputs_list,
                self.initial_state,
                self.attention_states,
                self.cell,
                loop_function=loop_function,
                scope='rnnlm')
        else:
            outputs, last_state = seq2seq.rnn_decoder(
                inputs_list,
                self.initial_state,
                self.cell,
                loop_function=loop_function,
                scope='rnnlm')

        # The last state is kept as the starting state for the next sampling
        # step; the outputs feed the objective.
        self.final_state = last_state

        # Flatten the outputs and project them to vocabulary logits.
        flat_outputs = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        # Weighted cross entropy over the flattened sequence; every position
        # has weight 1.
        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [unit_weights], args.vocab_size)

        # average loss for each word of each timestep
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.lr = tf.Variable(0.0, trainable=False)
        self.var_trainable_op = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, self.var_trainable_op)
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)

        # train_op is the op to run for one training step.
        self.train_op = optimizer.apply_gradients(
            zip(grads, self.var_trainable_op))
        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=5,
            keep_checkpoint_every_n_hours=1)

        # Log file name: timestamp with spaces and slashes stripped.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_name = str(timestamp + '.txt').replace(' ', '').replace('/', '')
        self.logfile = args.log_dir + log_name
        self.var_op = tf.global_variables()
    def __init__(self, config, mode='TRAIN', loaded_word_embed=None):
        """Builds the computing graph and initializes all variables.

        A "mode" LSTM reads the comment tokens while attending over a table of
        mode embeddings. A reply LSTM is then run through
        ``attention_decoder`` over the comment RNN outputs. The total loss is
        the comment language-model loss plus ``alpha`` times the reply loss.
        The training op is only built when ``mode`` is 'TRAIN'.

        Args:
            config: Configuration object contains all model configuration.
            mode: String from {'TRAIN', 'EVAL', 'INFER'}. In 'INFER' mode the
                comment_embeds, mode_probs, mix_mode_embeds and
                init_reply_embed attributes are populated.
            loaded_word_embed: A numpy array of pretrained word embedding.
                When None the word embedding is created with shape
                [vocab_size, embed_dim].

        Raises:
            ValueError: If mode is 'TRAIN' and config.opt_method is not one
                of 'SGD', 'AdaDelta' or 'Adam'.
        """
        # Initializes model parameters.
        self.batch_size = batch_size = config.batch_size
        self.vocab_size = vocab_size = config.vocab_size
        self.embed_dim = embed_dim = config.embed_dim
        self.hidden_dim = hidden_dim = config.hidden_dim
        self.num_hiddens = num_hiddens = config.num_hiddens
        self.num_modes = num_modes = config.num_modes
        self.mode_dim = mode_dim = config.mode_dim
        self.cmt_seq_len = cmt_seq_len = config.cmt_seq_len
        self.reply_seq_len = reply_seq_len = config.reply_seq_len
        # Objective weight for reply language modeling.
        self.alpha = alpha = config.alpha

        # Initializes placeholders for inputs.
        self.comment_inputs = []
        self.comment_weights = []
        self.reply_inputs = []
        self.reply_weights = []

        self._lr = tf.Variable(0.0, trainable=False)

        for i in xrange(cmt_seq_len):
            self.comment_inputs.append(
                tf.placeholder(tf.int32,
                               name='comment_input_{0}'.format(i),
                               shape=[batch_size]))
            self.comment_weights.append(
                tf.placeholder(tf.float32,
                               name='comment_weight_{0}'.format(i),
                               shape=[batch_size]))
        for i in xrange(reply_seq_len):
            self.reply_inputs.append(
                tf.placeholder(tf.int32,
                               name='reply_input_{0}'.format(i),
                               shape=[batch_size]))
            self.reply_weights.append(
                tf.placeholder(tf.float32,
                               name='reply_weight_{0}'.format(i),
                               shape=[batch_size]))

        # Only filled in INFER mode (see below).
        self.comment_embeds = []
        self.mix_mode_embeds = []
        self.mode_probs = []
        self.init_reply_embed = []

        # Initialize mode_rnn. Output dropout is applied only when training
        # with keep_prob < 1.
        if mode == 'TRAIN' and config.keep_prob < 1.0:
            mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.DropoutWrapper(
                    tf.nn.rnn_cell.BasicLSTMCell(
                        hidden_dim, forget_bias=config.forget_bias,
                        state_is_tuple=True),
                    output_keep_prob=config.keep_prob)
                for _ in xrange(num_hiddens)], state_is_tuple=True)
        else:
            mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True)
                for _ in xrange(num_hiddens)], state_is_tuple=True)

        # Defines the modes: every batch row indexes all num_modes modes.
        batch_mode_inds = tf.constant([range(num_modes)
                                       for _ in range(batch_size)])

        # Defines the embeddings on CPU.
        with tf.device('/cpu:0'):
            mode_embedding = tf.get_variable(
                'mode_embedding',
                [num_modes, mode_dim], dtype=tf.float32)
            att_mode_vecs = tf.nn.embedding_lookup(
                mode_embedding, batch_mode_inds)
            att_states = tf.reshape(
                att_mode_vecs, [-1, num_modes, 1, mode_dim])

        att_mode_weight = tf.get_variable('att_mode_weight',
                                          [1, 1, mode_dim, hidden_dim])

        # 1x1 convolution projects each mode vector from mode_dim to
        # hidden_dim.
        mode_feat = tf.nn.conv2d(
            att_states, att_mode_weight,
            [1, 1, 1, 1], 'SAME')
        att_v = tf.get_variable('att_v', [hidden_dim])

        def single_attention(query):
            """Returns (softmax scores over modes, score-weighted mode sum)."""
            with tf.variable_scope('attention_mlp'):
                y = linear(query, hidden_dim, True)
                y = tf.reshape(y, [-1, 1, 1, hidden_dim])
                s = tf.reduce_sum(att_v * tf.tanh(mode_feat + y), [2, 3])
                a_score = tf.nn.softmax(s)
                weighted_sum = tf.reduce_sum(
                    tf.reshape(a_score, [-1, num_modes, 1, 1]) * att_states,
                    [1, 2])
                a_score = tf.reshape(a_score, [-1, num_modes])
                weighted_sum = tf.reshape(weighted_sum, [-1, mode_dim])
            return a_score, weighted_sum

        with tf.device('/cpu:0'):
            if loaded_word_embed is None:
                embed_weight = tf.get_variable('word_embedding',
                                               [vocab_size, embed_dim])
            else:
                pretrain_word_embed = tf.constant(loaded_word_embed)
                embed_weight = tf.get_variable('word_embedding',
                                               initializer=pretrain_word_embed)

        cmt_state = mode_rnn.zero_state(batch_size, tf.float32)
        # Only the first layer's state tuple is used from here on.
        c_prev, cell_output = cmt_state[0]

        # Computes the residual value of content and global modes.
        att_proj_weight = tf.get_variable('att_proj_weight',
                                          [mode_dim, hidden_dim])
        att_probs, attns = single_attention(cell_output)
        cell_output += tf.matmul(attns, att_proj_weight)
        cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]

        mode_rnn_cell_output = []
        mode_probs = []
        lm_logits = []

        with tf.variable_scope('mode_rnn'):
            for i, cmt_in in enumerate(self.comment_inputs):
                # Variables are created on the first step and shared after.
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                cmt_embeds = tf.reshape(
                    tf.nn.embedding_lookup(embed_weight, cmt_in),
                    [batch_size, embed_dim])

                cell_output, cmt_state = mode_rnn(cmt_embeds, cmt_state)
                mode_rnn_cell_output.append(cell_output)
                att_probs, attns = single_attention(cell_output)

                # Add the projected attended mode vector to the hidden state
                # carried to the next step.
                c_prev, _ = cmt_state[0]
                cell_output += tf.matmul(attns, att_proj_weight)

                cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]

                with tf.variable_scope('attention_projection'):
                    attention_proj = linear(cell_output, vocab_size, True)

                lm_logits.append(attention_proj)
                mode_probs.append(att_probs)
                if mode == 'INFER':
                    self.mix_mode_embeds.append(attns)

        if mode == 'INFER':
            self.comment_embeds = mode_rnn_cell_output
            self.mode_probs = mode_probs

        # Stack the per-step comment RNN outputs along axis 1; these are the
        # attention states of the reply decoder.
        top_states = [tf.reshape(e, [-1, 1, mode_rnn.output_size])
                      for e in mode_rnn_cell_output]
        states_for_reply_rnn = tf.concat(1, top_states)

        # The last reply token is never an input; it is only a target.
        reply_embeds = [
            tf.reshape(tf.nn.embedding_lookup(embed_weight, reply_i),
                       [batch_size, embed_dim]) for reply_i in self.reply_inputs[:-1]]

        # Initialize reply_rnn (same dropout rule as mode_rnn).
        if mode == 'TRAIN' and config.keep_prob < 1.0:
            reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.DropoutWrapper(
                    tf.nn.rnn_cell.BasicLSTMCell(
                        hidden_dim, forget_bias=config.forget_bias,
                        state_is_tuple=True),
                    output_keep_prob=config.keep_prob)
                for _ in xrange(num_hiddens)], state_is_tuple=True)
        else:
            reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True)
                for _ in xrange(num_hiddens)], state_is_tuple=True)

        reply_rnn_output, reply_rnn_final_state = attention_decoder(
            reply_embeds, cmt_state, states_for_reply_rnn, reply_rnn)

        if mode == 'INFER':
            self.init_reply_embed = reply_rnn_output[0]

        # Computes the language model loss for the comment: the logits at
        # step t are scored against the token at step t + 1.
        comment_targets = self.comment_inputs[1:]
        lm_loss = tf.reduce_sum(sequence_loss_by_example(
            lm_logits[:-1], comment_targets, self.comment_weights[1:]))

        gen_logits = []
        with tf.variable_scope('gen_logit_projection'):
            for i, rnn_out in enumerate(reply_rnn_output):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                logits = linear(rnn_out, vocab_size, True)
                gen_logits.append(logits)

        # Computes the language model loss for the reply.
        reply_targets = self.reply_inputs[1:]
        gen_loss = tf.reduce_sum(sequence_loss_by_example(
            gen_logits, reply_targets, self.reply_weights[1:]))

        loss = lm_loss + alpha * gen_loss
        self.total_loss = loss

        self.saver = tf.train.Saver(tf.all_variables())

        # No optimizer is needed outside training.
        if mode != 'TRAIN':
            return

        tvars = tf.trainable_variables()
        grads = tf.gradients(loss, tvars)

        if config.opt_method == 'SGD':
            optimizer = tf.train.GradientDescentOptimizer(self._lr)
        elif config.opt_method == 'AdaDelta':
            optimizer = tf.train.AdadeltaOptimizer(self._lr)
        elif config.opt_method == 'Adam':
            optimizer = tf.train.AdamOptimizer(self._lr)
        else:
            # Fail with a clear message instead of falling through to an
            # unbound `optimizer` below.
            raise ValueError(
                'Unknown optimizer {}'.format(config.opt_method))
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Beispiel #7
0
    def build(self):
        """Build the char-to-word encoder / attention decoder graph.

        Pipeline, as constructed below:
        1. embed ``self.Xs`` and ``self.ts_go`` with a shared embedding table;
        2. run a GRU over the embedded input steps (char encoder);
        3. select char-encoder outputs with ``_grid_gather`` at
           ``self.X_spaces`` (presumably word boundaries - confirm against
           the helper) and run a second GRU over them (word encoder);
        4. run an attention decoder over the word-encoder outputs, started
           from the word encoder's final state;
        5. project every decoder output to ``alphabet_size`` logits.

        Sets ``self.embeddings``, ``self.out`` (list of per-step logits) and
        ``self.out_tensor`` (the same logits packed and transposed with
        perm [1, 0, 2]).
        """
        print('Building model')
        self.embeddings = tf.Variable(
            tf.random_uniform([self.alphabet_size, self.embedd_dims]),
            name='embeddings')

        X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
        t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

        with tf.variable_scope('split_X_inputs'):
            X_list = tf.split(
                split_dim=1,
                num_split=self.max_x_seq_len,
                value=X_embedded)

            # Squeeze only the split (time) axis. A bare squeeze would also
            # drop the batch axis for a batch of one, or the embedding axis
            # when embedd_dims == 1.
            X_list = [tf.squeeze(X, [1]) for X in X_list]

            for X in X_list:
                X.set_shape([None, self.embedd_dims])

        with tf.variable_scope('split_t_inputs'):
            t_list = tf.split(
                split_dim=1,
                num_split=self.max_t_seq_len,
                value=t_embedded)

            # Same as above: remove the time axis only.
            t_list = [tf.squeeze(t, [1]) for t in t_list]

            for t in t_list:
                t.set_shape([None, self.embedd_dims])

        with tf.variable_scope('dense_out'):
            W_out = tf.get_variable('W_out', [self.dec_units, self.alphabet_size])
            b_out = tf.get_variable('b_out', [self.alphabet_size])

        # char encoder
        char_cell = rnn_cell.GRUCell(self.char_enc_units)
        char_enc_outputs, char_enc_state = rnn.rnn(
            cell=char_cell,
            inputs=X_list,
            dtype=tf.float32,
            sequence_length=self.X_len,
            scope='rnn_char_encoder')

        # char2word: pack the per-step outputs, gather the selected
        # positions, then unpack back into a per-step list.
        char2word = tf.transpose(tf.pack(char_enc_outputs), perm=[1, 0, 2])
        char2word = _grid_gather(char2word, self.X_spaces)
        char2word = tf.unpack(tf.transpose(char2word, perm=[1, 0, 2]))

        for word_input in char2word:
            word_input.set_shape([None, self.char_enc_units])

        # word encoder
        word_cell = rnn_cell.GRUCell(self.word_enc_units)
        word_enc_outputs, word_enc_state = rnn.rnn(
            cell=word_cell,
            inputs=char2word,
            dtype=tf.float32,
            sequence_length=self.X_spaces_len,
            scope='rnn_word_encoder'
        )

        # The loop function provides inputs to the decoder:
        def decoder_loop_function(prev, i):
            def feedback_on():
                prev_1 = tf.matmul(prev, W_out) + b_out
                # feedback is on, so feed the decoder with the previous output
                return tf.gather(self.embeddings, tf.argmax(prev_1, 1))

            def feedback_off():
                # feedback is off, so just feed the decoder with t's
                return t_list[i]

            return tf.cond(self.feedback, feedback_on, feedback_off)

        # decoder
        att_states = tf.transpose(tf.pack(word_enc_outputs), perm=[1, 0, 2])
        dec_cell = rnn_cell.GRUCell(self.dec_units)
        dec_out, dec_state = seq2seq.attention_decoder(
            decoder_inputs=t_list,
            initial_state=word_enc_state,
            attention_states=att_states,
            cell=dec_cell,
            loop_function=decoder_loop_function,
            scope='attention_decoder'
        )

        # Project every decoder output to alphabet logits.
        self.out = [tf.matmul(d, W_out) + b_out for d in dec_out]

        # for debugging network (should write this outside of build)
        out_packed = tf.pack(self.out)
        out_packed = tf.transpose(out_packed, perm=[1, 0, 2])
        self.out_tensor = out_packed

        # add TensorBoard summaries for all variables
        tf.contrib.layers.summarize_variables()
Beispiel #8
0
def decoder_rnn(conv_encoder, rnn_encoder, decoder_inputs, decoder_hidden, weigth_generation, 
                weigth_copy, n_steps, bias_generation, bias_copy, batch_size, keep_prob,
                defendant, embedding, sample_rate, lstm_layer=1, is_train=True):
    """Attention decoder whose word scores add a generation and a copy term.

    Every decoder input is the embedding of a target token concatenated with
    ``conv_encoder``. From the second step on, the decoder input comes from a
    loop function that takes the argmax word of the previous output. When
    ``is_train`` is true that prediction is mixed per example with the
    ground-truth input: the prediction is used where ``sample_rate`` exceeds a
    uniform random draw, the ground truth otherwise.

    Args:
        conv_encoder: Tensor concatenated (axis 1) onto every decoder input.
        rnn_encoder: Encoder outputs; used as attention states and, after a
            tanh projection with ``weigth_copy``/``bias_copy``, by the copy net.
        decoder_inputs: Token ids looked up in ``embedding``.
        decoder_hidden: Number of LSTM units.
        weigth_generation, bias_generation: Projection for generation scores.
        weigth_copy, bias_copy: Projection applied to the encoder outputs.
        n_steps: Length of the returned score list.
        batch_size: Batch size for the zero state and the sampling mask.
        keep_prob: Keep probability of the dropout on the decoder outputs.
        defendant: Source token ids; reversed along axis 1 and one-hot encoded
            with depth equal to the first dimension of ``embedding``.
        embedding: Embedding matrix.
        sample_rate: Threshold for using the model's own prediction.
        lstm_layer: Number of stacked LSTM layers.
        is_train: Selects the mixing loop function described above.

    Returns:
        (res, state): ``res`` is a list of ``n_steps`` entries holding the
        per-step word scores (entries beyond the decoder length stay 0) and
        ``state`` is the final decoder state.
    """
    with tf.name_scope('decoder_rnn'):
        lstm_cell = rnn_cell.BasicLSTMCell(
            decoder_hidden, forget_bias=1.0, state_is_tuple=True)
        if lstm_layer > 1:
            lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * lstm_layer)
        initial_state = lstm_cell.zero_state(batch_size, tf.float32)

        # Embed the targets, split them into a per-step list and append the
        # conv encoding to each step.
        embedded_inputs = tf.nn.embedding_lookup(embedding, decoder_inputs)
        step_inputs = tf.unpack(tf.transpose(embedded_inputs, [1, 0, 2]))
        batch_decoder_inputs = [
            tf.concat(1, [step_input, conv_encoder])
            for step_input in step_inputs
        ]

        one_hot = tf.one_hot(
            indices=tf.reverse(defendant, dims=[False, True]),
            depth=embedding.get_shape().as_list()[0],
            on_value=1.,
            off_value=0.,
            axis=-1)

        # tanh projection of every encoder step, packed back afterwards.
        encoder_steps = tf.unpack(tf.transpose(rnn_encoder, [1, 0, 2]))
        projected_steps = [
            tf.tanh(tf.nn.bias_add(tf.matmul(step, weigth_copy), bias_copy))
            for step in encoder_steps
        ]
        rnn_encoder_temp = tf.transpose(tf.pack(projected_steps), [1, 0, 2])

        def copy_net(decoder_out):
            # Score each source position against the decoder output, then
            # scatter those scores onto the vocabulary via the one-hot ids.
            with tf.variable_scope('copy_net'):
                column = tf.reshape(decoder_out, [-1, decoder_hidden, 1])
                source_prob = tf.batch_matmul(rnn_encoder_temp, column)
                source_len = source_prob.get_shape().as_list()[1]
                source_prob = tf.reshape(source_prob, [-1, 1, source_len])
                voc_prob = tf.batch_matmul(source_prob, one_hot)
                voc_size = voc_prob.get_shape().as_list()[-1]
                return tf.reshape(voc_prob, [-1, voc_size])

        def word_scores(decoder_out):
            # generation prob + copy prob
            generation_prob = tf.nn.bias_add(
                tf.matmul(decoder_out, weigth_generation), bias_generation)
            copy_prob = copy_net(decoder_out)
            return tf.add(generation_prob, copy_prob)

        def predicted_input(prev):
            # Decoder input built from the argmax word of the previous output.
            sample = tf.argmax(word_scores(prev), 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        if is_train:
            def func(prev, i):
                prev_outputs = predicted_input(prev)

                # select from prev_outputs and ground truth
                prob = tf.random_uniform(
                    minval=0, maxval=1, shape=(batch_size,))
                mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
                mask = tf.expand_dims(mask, 1)
                input_width = prev_outputs.get_shape().as_list()[-1]
                mask = tf.tile(mask, [1, input_width])

                return (mask * prev_outputs
                        + (1 - mask) * batch_decoder_inputs[i])
        else:
            def func(prev, i):
                return predicted_input(prev)

        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=batch_decoder_inputs,
            initial_state=initial_state,
            attention_states=rnn_encoder,
            cell=lstm_cell,
            num_heads=1,
            loop_function=func,
            scope='rnn_decoder',
            initial_state_attention=False)

        # Dropout is applied to the list as one tensor, which is then split
        # back into per-step tensors.
        outputs = tf.unpack(tf.nn.dropout(outputs, keep_prob))

        res = [0] * n_steps
        for step, step_output in enumerate(outputs):
            res[step] = word_scores(step_output)

        return res, state
def decoder_rnn(conv_encoder,
                rnn_encoder,
                decoder_inputs,
                decoder_hidden,
                weigth_generation,
                weigth_copy,
                n_steps,
                bias_generation,
                bias_copy,
                batch_size,
                keep_prob,
                defendant,
                embedding,
                sample_rate,
                lstm_layer=1,
                is_train=True):
    """Build an attention decoder whose word scores add a copy term.

    Every decoder input is the embedding of a target token concatenated with
    ``conv_encoder``.  Word scores are the sum of a "generation" projection
    of the decoder output and a "copy" score that routes attention-like
    scores over the source positions onto vocabulary ids via a one-hot
    encoding of ``defendant``.

    Args:
        conv_encoder: 2-D tensor concatenated (along axis 1) to every decoder
            input.
        rnn_encoder: 3-D encoder output, used both as attention states and as
            the source memory of the copy mechanism (presumably
            [batch, time, hidden] -- confirm against the caller).
        decoder_inputs: Integer ids looked up in ``embedding`` (presumably
            [batch, time] -- confirm against the caller).
        decoder_hidden: Number of units of the decoder LSTM.
        weigth_generation: Weight matrix of the generation projection.
        weigth_copy: Weight matrix applied to every encoder step before the
            copy scores are computed.
        n_steps: Kept for interface compatibility.  The returned list now
            always has one entry per decoder output instead of being
            pre-sized with this value.
        bias_generation: Bias of the generation projection.
        bias_copy: Bias added after ``weigth_copy``.
        batch_size: Batch size used for the zero state and the sampling mask.
        keep_prob: Keep probability of the dropout applied to the outputs.
        defendant: Integer source ids; reversed along axis 1 and one-hot
            encoded to map source positions to vocabulary ids.
        embedding: Embedding matrix; its first dimension is used as the
            vocabulary size.
        sample_rate: Probability of feeding the model's own prediction
            instead of the ground-truth input during training.
        lstm_layer: Number of stacked LSTM layers.
        is_train: If True use scheduled sampling, otherwise always feed back
            the greedy prediction.

    Returns:
        A tuple ``(res, state)`` where ``res`` is a list with one word-score
        tensor per decoder step and ``state`` is the final decoder state.
    """
    with tf.name_scope('decoder_rnn'):

        lstm_cell = rnn_cell.BasicLSTMCell(decoder_hidden,
                                           forget_bias=1.0,
                                           state_is_tuple=True)

        if lstm_layer > 1:
            lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * lstm_layer)

        initial_state = lstm_cell.zero_state(batch_size, tf.float32)

        # Embed the targets, switch to time-major and split into one tensor
        # per step, each extended with the convolutional encoding.
        embedded = tf.nn.embedding_lookup(embedding, decoder_inputs)
        embedded = tf.transpose(embedded, [1, 0, 2])
        batch_decoder_inputs = [
            tf.concat(1, [step_input, conv_encoder])
            for step_input in tf.unpack(embedded)
        ]

        one_hot = tf.one_hot(indices=tf.reverse(defendant, dims=[False, True]),
                             depth=embedding.get_shape().as_list()[0],
                             on_value=1.,
                             off_value=0.,
                             axis=-1)

        # Project every encoder step once; the result is shared by all
        # copy-score computations below.
        rnn_time_major = tf.unpack(tf.transpose(rnn_encoder, [1, 0, 2]))
        rnn_encoder_temp = [
            tf.tanh(tf.nn.bias_add(tf.matmul(step, weigth_copy), bias_copy))
            for step in rnn_time_major
        ]
        rnn_encoder_temp = tf.transpose(tf.pack(rnn_encoder_temp), [1, 0, 2])

        def copy_net(decoder_out):
            """Score vocabulary ids by matching decoder_out to the source."""
            with tf.variable_scope('copy_net'):
                decoder_out = tf.reshape(decoder_out, [-1, decoder_hidden, 1])
                source_prob = tf.batch_matmul(rnn_encoder_temp, decoder_out)
                source_prob = tf.reshape(
                    source_prob,
                    [-1, 1, source_prob.get_shape().as_list()[1]])
                voc_prob = tf.batch_matmul(source_prob, one_hot)
                voc_prob = tf.reshape(
                    voc_prob, [-1, voc_prob.get_shape().as_list()[-1]])
                return voc_prob

        def word_scores(decoder_out):
            """Return generation scores plus copy scores for one step."""
            generation_prob = tf.nn.bias_add(
                tf.matmul(decoder_out, weigth_generation), bias_generation)
            copy_prob = copy_net(decoder_out)
            return tf.add(generation_prob, copy_prob)

        def greedy_input(prev):
            """Turn the previous output into the next greedy decoder input."""
            sample = tf.argmax(word_scores(prev), 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        if is_train:

            def loop_function(prev, step):
                prev_outputs = greedy_input(prev)

                # Scheduled sampling: per example, pick the prediction with
                # probability sample_rate, otherwise the ground truth.
                prob = tf.random_uniform(minval=0,
                                         maxval=1,
                                         shape=(batch_size, ))
                mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
                mask = tf.expand_dims(mask, 1)
                mask = tf.tile(mask,
                               [1, prev_outputs.get_shape().as_list()[-1]])

                return (mask * prev_outputs +
                        (1 - mask) * batch_decoder_inputs[step])

        else:

            def loop_function(prev, step):
                return greedy_input(prev)

        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=batch_decoder_inputs,
            initial_state=initial_state,
            attention_states=rnn_encoder,
            cell=lstm_cell,
            num_heads=1,
            loop_function=loop_function,
            scope='rnn_decoder',
            initial_state_attention=False)

        # Dropout packs the per-step list into one tensor, so split it again.
        outputs = tf.nn.dropout(outputs, keep_prob)
        outputs = tf.unpack(outputs)

        # One score tensor per actual decoder output; never leaves integer
        # placeholders behind and cannot overrun a pre-sized list.
        res = [word_scores(step_output) for step_output in outputs]

        return res, state
Beispiel #10
0
    def __init__(self, args, infer=False):
        """Build the RNN language-model graph, its loss and training ops.

        Args:
            args: Configuration object.  The attributes read here are
                ``rnncell``, ``rnn_size``, ``num_layers``, ``batch_size``,
                ``seq_length``, ``vocab_size``, ``embedding_size``,
                ``attention``, ``grad_clip`` and ``log_dir``.
            infer: If True, ``args.batch_size`` and ``args.seq_length`` are
                overwritten with 1 (on the passed-in object) and the decoder
                feeds its own greedy prediction back as the next input.

        Raises:
            ValueError: If ``args.rnncell`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        if args.rnncell == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.rnncell == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.rnncell == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            raise ValueError("rnncell type not supported: {}".format(
                args.rnncell))

        cell = cell_fn(args.rnn_size)
        self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length])
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)
        with tf.variable_scope('rnnlm'):
            softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                     name='soft_w')
            softmax_b = build_weight([args.vocab_size], name='soft_b')
            word_embedding = build_weight(
                [args.vocab_size, args.embedding_size], name='word_embedding')
            # Split the embedded batch into one [batch, embedding] tensor per
            # time step.
            inputs_list = tf.split(
                1, args.seq_length,
                tf.nn.embedding_lookup(word_embedding, self.input_data))
            inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

        def loop(prev, _):
            """Embed the greedy prediction of the previous decoder output."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            # Use the embedding table built above; the bare name `embedding`
            # is not defined in this scope.
            return tf.nn.embedding_lookup(word_embedding, prev_symbol)

        loop_function = loop if infer else None
        if not args.attention:
            outputs, last_state = seq2seq.rnn_decoder(
                inputs_list,
                self.initial_state,
                self.cell,
                loop_function=loop_function,
                scope='rnnlm')
        else:
            self.attn_length = 5
            self.attn_size = 32
            self.attention_states = build_weight(
                [args.batch_size, self.attn_length, self.attn_size])
            outputs, last_state = seq2seq.attention_decoder(
                inputs_list,
                self.initial_state,
                self.attention_states,
                self.cell,
                loop_function=loop_function,
                scope='rnnlm')

        self.final_state = last_state
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        # average loss for each word of each timestep
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        # Created at 0.0; nothing in this method assigns a real rate.
        self.lr = tf.Variable(0.0, trainable=False)
        self.var_trainable_op = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(
            zip(grads, self.var_trainable_op))
        self.initial_op = tf.global_variables_initializer()
        # Log file name: timestamp with spaces and slashes stripped, appended
        # to the log directory as-is.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.logfile = args.log_dir + (timestamp + '.txt').replace(
            ' ', '').replace('/', '')
        self.var_op = tf.global_variables()
        self.saver = tf.train.Saver(self.var_op,
                                    max_to_keep=5,
                                    keep_checkpoint_every_n_hours=1)
Beispiel #11
0
def decoder_rnn(conv_encoder,
                rnn_encoder,
                decoder_inputs,
                decoder_hidden,
                weigth_generation,
                n_steps,
                bias_generation,
                batch_size,
                keep_prob,
                defendant,
                embedding,
                sample_rate,
                lstm_layer=1,
                is_train=True):
    """Build an attention decoder that emits per-step word scores.

    Every decoder input is the embedding of a target token concatenated with
    ``conv_encoder``.  Word scores are a linear projection of the decoder
    output.

    Args:
        conv_encoder: 2-D tensor concatenated (along axis 1) to every decoder
            input.
        rnn_encoder: Encoder output used as the attention states.
        decoder_inputs: Integer ids looked up in ``embedding`` (presumably
            [batch, time] -- confirm against the caller).
        decoder_hidden: Number of units of the decoder LSTM.
        weigth_generation: Weight matrix of the output projection.
        n_steps: Kept for interface compatibility.  The returned list now
            always has one entry per decoder output instead of being
            pre-sized with this value.
        bias_generation: Bias of the output projection.
        batch_size: Batch size used for the zero state and the sampling mask.
        keep_prob: Keep probability of the dropout applied to the outputs.
        defendant: Unused by this variant; kept for interface compatibility.
        embedding: Embedding matrix for the decoder tokens.
        sample_rate: Probability of feeding the model's own prediction
            instead of the ground-truth input during training.
        lstm_layer: Number of stacked LSTM layers.
        is_train: If True use scheduled sampling, otherwise always feed back
            the greedy prediction.

    Returns:
        A tuple ``(res, state)`` where ``res`` is a list with one word-score
        tensor per decoder step and ``state`` is the final decoder state.
    """
    with tf.name_scope('decoder_rnn'):

        lstm_cell = rnn_cell.BasicLSTMCell(decoder_hidden,
                                           forget_bias=1.0,
                                           state_is_tuple=True)

        if lstm_layer > 1:
            lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * lstm_layer)

        initial_state = lstm_cell.zero_state(batch_size, tf.float32)

        # Embed the targets, switch to time-major and split into one tensor
        # per step, each extended with the convolutional encoding.
        embedded = tf.nn.embedding_lookup(embedding, decoder_inputs)
        embedded = tf.transpose(embedded, [1, 0, 2])
        batch_decoder_inputs = [
            tf.concat(1, [step_input, conv_encoder])
            for step_input in tf.unpack(embedded)
        ]

        def word_scores(decoder_out):
            """Project a decoder output onto the vocabulary."""
            return tf.nn.bias_add(tf.matmul(decoder_out, weigth_generation),
                                  bias_generation)

        def greedy_input(prev):
            """Turn the previous output into the next greedy decoder input."""
            sample = tf.argmax(word_scores(prev), 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        if is_train:

            def loop_function(prev, step):
                prev_outputs = greedy_input(prev)

                # Scheduled sampling: per example, pick the prediction with
                # probability sample_rate, otherwise the ground truth.
                prob = tf.random_uniform(minval=0,
                                         maxval=1,
                                         shape=(batch_size, ))
                mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
                mask = tf.expand_dims(mask, 1)
                mask = tf.tile(mask,
                               [1, prev_outputs.get_shape().as_list()[-1]])

                return (mask * prev_outputs +
                        (1 - mask) * batch_decoder_inputs[step])

        else:

            def loop_function(prev, step):
                return greedy_input(prev)

        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=batch_decoder_inputs,
            initial_state=initial_state,
            attention_states=rnn_encoder,
            cell=lstm_cell,
            num_heads=1,
            loop_function=loop_function,
            scope='rnn_decoder',
            initial_state_attention=False)

        # Dropout packs the per-step list into one tensor, so split it again.
        outputs = tf.nn.dropout(outputs, keep_prob)
        outputs = tf.unpack(outputs)

        # One score tensor per actual decoder output; never leaves integer
        # placeholders behind and cannot overrun a pre-sized list.
        res = [word_scores(step_output) for step_output in outputs]

        return res, state
Beispiel #12
0
def embedding_attention_decoder(decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                embedding_size,
                                num_heads=1,
                                output_size=None,
                                output_projection=None,
                                feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=False,
                                embedding=None):
    """RNN decoder with embedding and attention and a pure-decoding option.

  Args:
    decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function.
    num_symbols: Integer, how many symbols come into the embedding.
    embedding_size: Integer, the length of the embedding vector for each symbol.
    num_heads: Number of attention heads that read from attention_states.
    output_size: Size of the output vectors; if None, use cell.output_size.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [output_size x num_symbols] and B has shape
      [num_symbols]; if provided and feed_previous=True, each fed previous
      output will first be multiplied by W and added B.
    feed_previous: Boolean; if True, only the first of decoder_inputs will be
      used (the "GO" symbol), and all other decoder inputs will be generated by:
        next = embedding_lookup(embedding, argmax(previous_output)),
      In effect, this implements a greedy decoder. It can also be used
      during training to emulate http://arxiv.org/abs/1506.03099.
      If False, decoder_inputs are used as given (the standard decoder case).
    update_embedding_for_previous: Boolean; if False and feed_previous=True,
      only the embedding for the first symbol of decoder_inputs (the "GO"
      symbol) will be updated by back propagation. Embeddings for the symbols
      generated from the decoder itself remain unchanged. This parameter has
      no effect if feed_previous=False.
    dtype: The dtype used to convert the output projection biases.
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.
    embedding: Optional embedding matrix shared with the caller. If None, a
      variable named "embedding" with shape [num_symbols x embedding_size] is
      created inside this function's variable scope.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: When output_projection has the wrong shape.
  """
    if output_size is None:
        output_size = cell.output_size
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(scope or "embedding_attention_decoder"):
        if embedding is None:
            # No shared embedding supplied: own one, so the default argument
            # value is usable instead of failing inside embedding_lookup.
            with ops.device("/cpu:0"):
                embedding = variable_scope.get_variable(
                    "embedding", [num_symbols, embedding_size])
        loop_function = _extract_sample_and_embed(
            embedding, output_projection,
            update_embedding_for_previous) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]
        return attention_decoder(
            emb_inp,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
def embedding_attention_decoder(decoder_inputs, initial_state, attention_states,
                                cell, num_symbols, embedding_size, num_heads=1,
                                output_size=None, output_projection=None,
                                feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32, scope=None,
                                initial_state_attention=False,
                                embedding=None):
  """RNN decoder with embedding and attention and a pure-decoding option.

  Args:
    decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function.
    num_symbols: Integer, how many symbols come into the embedding.
    embedding_size: Integer, the length of the embedding vector for each symbol.
    num_heads: Number of attention heads that read from attention_states.
    output_size: Size of the output vectors; if None, use cell.output_size.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [output_size x num_symbols] and B has shape
      [num_symbols]; if provided and feed_previous=True, each fed previous
      output will first be multiplied by W and added B.
    feed_previous: Boolean; if True, only the first of decoder_inputs will be
      used (the "GO" symbol), and all other decoder inputs will be generated by:
        next = embedding_lookup(embedding, argmax(previous_output)),
      In effect, this implements a greedy decoder. It can also be used
      during training to emulate http://arxiv.org/abs/1506.03099.
      If False, decoder_inputs are used as given (the standard decoder case).
    update_embedding_for_previous: Boolean; if False and feed_previous=True,
      only the embedding for the first symbol of decoder_inputs (the "GO"
      symbol) will be updated by back propagation. Embeddings for the symbols
      generated from the decoder itself remain unchanged. This parameter has
      no effect if feed_previous=False.
    dtype: The dtype used to convert the output projection biases.
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.
    embedding: Optional embedding matrix shared with the caller. If None, a
      variable named "embedding" with shape [num_symbols x embedding_size] is
      created inside this function's variable scope.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: When output_projection has the wrong shape.
  """
  if output_size is None:
    output_size = cell.output_size
  if output_projection is not None:
    proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
    proj_biases.get_shape().assert_is_compatible_with([num_symbols])

  with variable_scope.variable_scope(scope or "embedding_attention_decoder"):
    if embedding is None:
      # No shared embedding supplied: own one, so the default argument value
      # is usable instead of failing inside embedding_lookup.
      with ops.device("/cpu:0"):
        embedding = variable_scope.get_variable(
            "embedding", [num_symbols, embedding_size])
    loop_function = _extract_sample_and_embed(
        embedding, output_projection,
        update_embedding_for_previous) if feed_previous else None
    emb_inp = [
        embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs]
    return attention_decoder(
        emb_inp, initial_state, attention_states, cell, output_size=output_size,
        num_heads=num_heads, loop_function=loop_function,
        initial_state_attention=initial_state_attention)
Beispiel #14
0
    def build(self):
        """Assemble the char encoder -> word encoder -> attention decoder graph.

        Reads ``self.Xs``, ``self.ts_go``, ``self.X_len``, ``self.X_spaces``,
        ``self.X_spaces_len`` and ``self.feedback`` together with the size
        attributes used below, and stores the resulting tensors in
        ``self.embeddings``, ``self.out`` and ``self.out_tensor``.
        """
        print('Building model')
        # One embedding table, used for the source ids, the target ids and
        # the fed-back decoder predictions.
        self.embeddings = tf.Variable(tf.random_uniform(
            [self.alphabet_size, self.embedd_dims]),
                                      name='embeddings')

        X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
        t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

        # Split the embedded source along axis 1 into max_x_seq_len pieces.
        with tf.variable_scope('split_X_inputs'):
            X_list = tf.split(split_dim=1,
                              num_split=self.max_x_seq_len,
                              value=X_embedded)

            # NOTE(review): squeeze is called without an explicit axis, so it
            # drops every size-1 dimension; the shape is re-declared below.
            X_list = [tf.squeeze(X) for X in X_list]

            [X.set_shape([None, self.embedd_dims]) for X in X_list]

        # Same treatment for the embedded targets (max_t_seq_len pieces).
        with tf.variable_scope('split_t_inputs'):
            t_list = tf.split(split_dim=1,
                              num_split=self.max_t_seq_len,
                              value=t_embedded)

            t_list = [tf.squeeze(t) for t in t_list]

            [t.set_shape([None, self.embedd_dims]) for t in t_list]

        # Output projection from decoder units to alphabet-sized scores.
        with tf.variable_scope('dense_out'):
            W_out = tf.get_variable('W_out',
                                    [self.dec_units, self.alphabet_size])
            b_out = tf.get_variable('b_out', [self.alphabet_size])

        # char encoder
        char_cell = rnn_cell.GRUCell(self.char_enc_units)
        char_enc_outputs, char_enc_state = rnn.rnn(cell=char_cell,
                                                   inputs=X_list,
                                                   dtype=tf.float32,
                                                   sequence_length=self.X_len,
                                                   scope='rnn_char_encoder')

        # char2word
        # Stack the per-step outputs, move axis 1 to the front, gather with
        # X_spaces, then split back into a per-step list.
        # NOTE(review): _grid_gather is defined elsewhere; presumably it picks
        # the encoder outputs at the positions in X_spaces -- confirm.
        char2word = tf.transpose(tf.pack(char_enc_outputs), perm=[1, 0, 2])
        char2word = _grid_gather(char2word, self.X_spaces)
        char2word = tf.unpack(tf.transpose(char2word, perm=[1, 0, 2]))

        [t.set_shape([None, self.char_enc_units]) for t in char2word]

        # word encoder
        word_cell = rnn_cell.GRUCell(self.word_enc_units)
        word_enc_outputs, word_enc_state = rnn.rnn(
            cell=word_cell,
            inputs=char2word,
            dtype=tf.float32,
            sequence_length=self.X_spaces_len,
            scope='rnn_word_encoder')

        # The loop function provides inputs to the decoder:
        def decoder_loop_function(prev, i):
            def feedback_on():
                prev_1 = tf.matmul(prev, W_out) + b_out
                # feedback is on, so feed the decoder with the previous output
                return tf.gather(self.embeddings, tf.argmax(prev_1, 1))

            def feedback_off():
                # feedback is off, so just feed the decoder with t's
                return t_list[i]

            # self.feedback is the predicate of tf.cond, so the choice between
            # the two branches is made when the graph is run.
            return tf.cond(self.feedback, feedback_on, feedback_off)

        # decoder
        # Attention states are the stacked word-encoder outputs with axes 0
        # and 1 swapped; the decoder starts from the word encoder's state.
        att_states = tf.transpose(tf.pack(word_enc_outputs), perm=[1, 0, 2])
        dec_cell = rnn_cell.GRUCell(self.dec_units)
        dec_out, dec_state = seq2seq.attention_decoder(
            decoder_inputs=t_list,
            initial_state=word_enc_state,
            attention_states=att_states,
            cell=dec_cell,
            loop_function=decoder_loop_function,
            scope='attention_decoder')

        # Project every decoder output with the dense output layer.
        self.out = []
        for d in dec_out:
            self.out.append(tf.matmul(d, W_out) + b_out)

        # for debugging network (should write this outside of build)
        out_packed = tf.pack(self.out)
        out_packed = tf.transpose(out_packed, perm=[1, 0, 2])
        self.out_tensor = out_packed

        # add TensorFlow TensorBoard summaries for all variables
        tf.contrib.layers.summarize_variables()