Esempio n. 1
0
 def _encoder_init(self):
     """Build the unidirectional RNN encoder inside the 'encoder' name scope.

     Reads ``unit_type``, ``num_units``, ``num_layers``, ``keep_prob``,
     ``batch_size``, ``encoder_emb_inp``, ``encoder_input_lengths`` and
     ``time_major`` from ``self``.

     Sets:
         self.encoder_outputs: outputs returned by ``tf.nn.dynamic_rnn``.
         self.encoder_state: final state returned by ``tf.nn.dynamic_rnn``.
     """
     with tf.name_scope('encoder'):
         rnn_cell = create_rnn_cell(
             self.unit_type,
             self.num_units,
             self.num_layers,
             self.keep_prob,
         )
         # The encoder always starts from an all-zero float32 state.
         zero_state = rnn_cell.zero_state(self.batch_size, tf.float32)
         outputs, final_state = tf.nn.dynamic_rnn(
             rnn_cell,
             self.encoder_emb_inp,
             sequence_length=self.encoder_input_lengths,
             time_major=self.time_major,
             initial_state=zero_state,
         )
         self.encoder_outputs = outputs
         self.encoder_state = final_state
Esempio n. 2
0
    def _encoder_init(self):
        """Build the encoder sub-graph inside the 'encoder' name scope.

        The architecture is selected by ``self.encoder_type``:

        * ``'uni'``: a unidirectional RNN with ``self.num_layers`` layers.
        * ``'bi'``: a bidirectional RNN with ``self.num_layers // 2`` layers
          per direction. Forward and backward outputs are concatenated on
          the last axis, and the per-layer final states are interleaved as
          ``(fw_0, bw_0, fw_1, bw_1, ...)``.

        Sets:
            self.encoder_outputs: encoder outputs for every time step.
            self.encoder_state: final encoder state handed to the decoder.

        Raises:
            ValueError: if ``self.encoder_type`` is neither 'uni' nor 'bi'.
        """
        with tf.name_scope('encoder'):
            if self.encoder_type == 'uni':
                encoder_cell = create_rnn_cell(
                    self.unit_type,
                    self.num_units,  # number of hidden units per cell
                    self.num_layers,  # depth of the RNN stack
                    self.keep_prob)  # keep probability used for dropout
                # All-zero initial state for the whole stack.
                encoder_init_state = encoder_cell.zero_state(
                    self.batch_size, tf.float32)

                # dynamic_rnn unrolls the network with a dynamic loop, so
                # the number of time steps does not have to be fixed here.
                # sequence_length gives the valid (non-PAD) length of every
                # sentence; time_major selects between
                # [max_time, batch_size, depth] and
                # [batch_size, max_time, depth] layouts.
                self.encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
                    encoder_cell,
                    self.encoder_emb_inp,
                    sequence_length=self.encoder_input_lengths,
                    time_major=self.time_major,
                    initial_state=encoder_init_state)
            elif self.encoder_type == 'bi':
                # Each direction gets half of the configured depth.
                # NOTE(review): an odd num_layers yields fewer encoder
                # states than num_layers - confirm the decoder depth
                # matches.
                num_bi_layers = self.num_layers // 2
                fw_cell = create_rnn_cell(
                    self.unit_type,
                    self.num_units,
                    num_bi_layers,  # depth of the forward stack
                    self.keep_prob)  # keep probability used for dropout
                bw_cell = create_rnn_cell(
                    self.unit_type,
                    self.num_units,
                    num_bi_layers,  # depth of the backward stack
                    self.keep_prob)  # keep probability used for dropout
                bw_init_state = bw_cell.zero_state(self.batch_size,
                                                   tf.float32)
                fw_init_state = fw_cell.zero_state(self.batch_size,
                                                   tf.float32)

                # bi_outputs is the pair (output_fw, output_bw);
                # bi_state is the pair (final_state_fw, final_state_bw).
                bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
                    fw_cell,
                    bw_cell,
                    self.encoder_emb_inp,
                    sequence_length=self.encoder_input_lengths,
                    time_major=self.time_major,
                    initial_state_fw=fw_init_state,
                    initial_state_bw=bw_init_state)

                self.encoder_outputs = tf.concat(bi_outputs, -1)

                # Re-arrange the final states so they line up with the
                # decoder: take layer `layer_id` of the forward stack, then
                # the same layer of the backward stack. Each direction's
                # state is indexed per layer here (the single-layer case
                # already relied on that); indexing the layer-0 state by
                # layer_id instead would pick its inner components rather
                # than the deeper layers.
                fw_state, bw_state = bi_state
                encoder_state = []
                for layer_id in range(num_bi_layers):
                    encoder_state.append(fw_state[layer_id])
                    encoder_state.append(bw_state[layer_id])
                self.encoder_state = tuple(encoder_state)
            else:
                raise ValueError('Unknown encoder_type %s' % self.encoder_type)
Esempio n. 3
0
    def _decoder_init(self):
        """Build the attention decoder plus the training or inference ops.

        Common part: an attention-wrapped RNN cell whose initial cell state
        is the encoder's final state, and a bias-free Dense projection onto
        the decoder vocabulary. For beam-search inference the attention
        memory, the source lengths and the encoder state are tiled
        ``self.beam_width`` times first.

        ``self.mode == 'train'`` sets ``self.preds``, ``self.istarget``,
        ``self.acc``, ``self.cost``, ``self.global_step``,
        ``self.learning_rate`` (replaced by its decayed tensor),
        ``self.trainable_variables``, ``self.grads`` and ``self.update``.

        ``self.mode == 'infer'`` sets ``self.translations`` and, for greedy
        decoding only, ``self.inference_attention_matrices``.

        Raises:
            ValueError: on an unknown ``self.optimizer`` (train) or an
                unknown ``self.infer_mode[0]`` (infer).
        """
        # The attention memory is fed batch-major.
        if self.time_major:
            # [max_time, batch_size, depth] -> [batch_size, max_time, depth]
            memory = tf.transpose(self.encoder_outputs, [1, 0, 2])
        else:
            memory = self.encoder_outputs  # [batch_size, max_time, depth]

        if self.mode == 'infer' and self.infer_mode[0] == 'beam_search':
            # Beam search decodes beam_width hypotheses per example, so
            # every encoder-side tensor is repeated beam_width times.
            memory = tf.contrib.seq2seq.tile_batch(memory,
                                                   multiplier=self.beam_width)
            encoder_input_lengths = tf.contrib.seq2seq.tile_batch(
                self.encoder_input_lengths, multiplier=self.beam_width)
            encoder_state = tf.contrib.seq2seq.tile_batch(
                self.encoder_state, multiplier=self.beam_width)
            batch_size = self.batch_size * self.beam_width
        else:
            batch_size = self.batch_size
            encoder_input_lengths = self.encoder_input_lengths
            encoder_state = self.encoder_state

        dtype = tf.float32

        with tf.name_scope('decoder'):
            cell = create_rnn_cell(self.unit_type, self.num_units,
                                   self.num_layers, self.keep_prob)
            attention_mechanism = attention_mechanism_fn(
                self.attention_type, self.num_units, memory,
                encoder_input_lengths)

            # The alignment history is kept to visualise attention later;
            # it is only requested for non-beam-search inference.
            alignment_history = (self.mode == 'infer'
                                 and self.infer_mode[0] != "beam_search")

            cell = tf.contrib.seq2seq.AttentionWrapper(
                cell,
                attention_mechanism,
                alignment_history=alignment_history,
                # Size of the attention (output) layer that mixes the
                # context vector with the cell output at every step.
                attention_layer_size=self.num_units)

            # Important: start from the wrapper's zero state and replace
            # only the wrapped cell state with the encoder's final state.
            init_state = cell.zero_state(batch_size,
                                         dtype).clone(cell_state=encoder_state)

            projection_layer = Dense(self.decoder_vocab_size, use_bias=False)

        if self.mode == 'train':
            # TrainingHelper feeds the ground-truth decoder inputs.
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.decoder_emb_inp,
                sequence_length=self.decoder_input_lengths,
                time_major=True)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell,
                train_helper,
                init_state,
                output_layer=projection_layer)  # applied to the RNN outputs
            # dynamic_decode returns
            # (final_outputs, final_state, final_sequence_lengths).
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                train_decoder,
                output_time_major=True,
                swap_memory=True)  # allow GPU<->CPU swapping in the loop

            logits = outputs.rnn_output

            # Token accuracy over non-padding positions (target id != 0).
            self.preds = tf.to_int32(tf.argmax(logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.decoder_targets, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.decoder_targets)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            with tf.name_scope('optimizer'):
                # Masked cross-entropy: loss * mask drops the <PAD>
                # positions; the sum is divided by the batch size.
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_targets, logits=logits)
                self.cost = tf.reduce_sum((loss * self.mask) / tf.to_float(
                    self.batch_size))
                tf.summary.scalar('loss', self.cost)

                # Learning-rate decay. The step counter must not be
                # trainable, otherwise it is returned by
                # tf.trainable_variables() and ends up in the gradient list.
                self.global_step = tf.Variable(0, trainable=False)
                # decayed_lr = lr * decay_rate ^ (global_step / decay_steps);
                # staircase=True makes the exponent an integer division, so
                # the rate decays in steps.
                self.learning_rate = tf.train.exponential_decay(
                    self.learning_rate,
                    self.global_step,
                    self.decay_steps,
                    self.decay_rate,
                    staircase=True)

                # Gradient clipping by global norm:
                #   t_list[i] * clip_norm / max(global_norm, clip_norm)
                #   global_norm = sqrt(sum(l2norm(t)**2 for t in t_list))
                self.trainable_variables = tf.trainable_variables()
                self.grads, _ = tf.clip_by_global_norm(
                    tf.gradients(self.cost, self.trainable_variables),
                    self.max_gradient_norm)

                # Optimizer: adam | sgd
                if self.optimizer == 'adam':
                    opt = tf.train.AdamOptimizer(self.learning_rate)
                elif self.optimizer == 'sgd':
                    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                else:
                    raise ValueError('unkown optimizer %s' % self.optimizer)

                self.update = opt.apply_gradients(
                    zip(self.grads, self.trainable_variables),
                    global_step=self.global_step)

        elif self.mode == 'infer':
            # start_tokens is sized by the un-tiled self.batch_size even
            # for beam search.
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * 1  # 1: index of <GO>
            end_token = 2  # 2: index of <EOS>
            infer_mode = self.infer_mode[0]

            if infer_mode == 'greedy':
                # Feeds the argmax of the logits back through the embedding
                # to obtain the next input.
                infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=self.decoder_embedding,
                    start_tokens=start_tokens,
                    end_token=end_token)
                infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell,
                    infer_helper,
                    init_state,
                    output_layer=projection_layer)
            elif infer_mode == 'beam_search':
                infer_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=cell,
                    embedding=self.decoder_embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=init_state,
                    beam_width=self.beam_width,
                    output_layer=projection_layer)
            else:
                raise ValueError('unkown infer mode %s' % infer_mode)

            # At most 50 decoding steps.
            decoder_outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer_decoder, maximum_iterations=50)

            if infer_mode == 'greedy':
                # Stacked alignments:
                # [decoder_steps, batch_size, encoder_steps]
                self.inference_attention_matrices = final_context_state.alignment_history.stack(
                    name="inference_attention_matrix")
                self.translations = decoder_outputs.sample_id
            else:
                # Only 'beam_search' can reach this point; any other mode
                # was rejected before decoding.
                self.translations = decoder_outputs.predicted_ids
Esempio n. 4
0
    def build_graph(self):
        """Build the whole Seq2Seq graph: inputs, encoder, decoder, loss, inference.

        Creates the placeholders ``self.batch_size``,
        ``self.encoder_sources``, ``self.decoder_targets``,
        ``self.sources_length_list`` and ``self.targets_length_list``, then
        sets:

            self.loss: masked sequence loss of the teacher-forced decoder.
            self.train_op: Adam update with gradients clipped to
                ``self.clip_norm`` by global norm.
            self.inference_outputs: token ids from greedy decoding
                (at most 20 steps).

        When ``self.use_attention`` is true the decoder cell is wrapped in
        an ``AttentionWrapper`` over the encoder outputs; otherwise the
        plain RNN cell is used and starts directly from the encoder state.
        """
        print('#' * 48 + '\n \n Begin to build Seq2Seq model \n \n' + '#' * 48)

        # Placeholders.
        self.batch_size = tf.placeholder(tf.int32, [])
        self.encoder_sources = tf.placeholder(tf.int32, [None, None])
        self.decoder_targets = tf.placeholder(tf.int32, [None, None])
        # NOTE(review): sources_length_list is created but not consumed
        # anywhere in this method - confirm whether the encoder/attention
        # should mask padding with it.
        self.sources_length_list = tf.placeholder(tf.int32, [None])
        self.targets_length_list = tf.placeholder(tf.int32, [None])

        # Encoder.
        with tf.variable_scope('Encoder'):
            # The chatbot corpus is monolingual, so source and target
            # sequences share one embedding matrix.
            embedding = tf.get_variable(
                name='embedding_matrix',
                shape=[self.vocab_size, self.encoder_embedding_size])
            encoder_cell = create_rnn_cell(self.encoder_rnn_type,
                                           self.encoder_num_layers,
                                           self.encoder_rnn_size,
                                           self.output_keep_prob)
            # [batch_size, source_length, embedding_size]
            embedded_encoder_input = tf.nn.embedding_lookup(
                embedding, self.encoder_sources)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=embedded_encoder_input,
                dtype=tf.float32)

        # Decoder.
        with tf.variable_scope('Decoder'):
            rnn_cell = create_rnn_cell(self.decoder_rnn_type,
                                       self.decoder_num_layers,
                                       self.decoder_rnn_size,
                                       self.output_keep_prob)

            # The cell and its initial state are chosen together: an
            # AttentionWrapper needs its own wrapper state, so the raw
            # encoder state is only valid for the plain RNN cell.
            if self.use_attention:
                attention_mechanism = create_attention_mechanism(
                    self.attention_type, self.encoder_rnn_size,
                    encoder_outputs)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell=rnn_cell, attention_mechanism=attention_mechanism)
                decoder_initial_state = decoder_cell.zero_state(
                    batch_size=self.batch_size,
                    dtype=tf.float32).clone(cell_state=encoder_state)
            else:
                decoder_cell = rnn_cell
                decoder_initial_state = encoder_state

            # Training inputs: prepend <go> to the targets, then drop the
            # last column (the end symbol such as <EOS>).
            go_id = self.word2idx['<go>']
            decoder_input = tf.concat([
                tf.fill([self.batch_size, 1], go_id),
                self.decoder_targets
            ], 1)
            decoder_input = tf.strided_slice(input_=decoder_input,
                                             begin=[0, 0],
                                             end=[self.batch_size, -1],
                                             strides=[1, 1])
            embedded_decoder_input = tf.nn.embedding_lookup(
                embedding, decoder_input)

            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=embedded_decoder_input,
                sequence_length=self.targets_length_list)
            projection_layer = tf.layers.Dense(units=self.vocab_size)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=decoder_initial_state,
                output_layer=projection_layer)

            # rnn_output: [batch_size, targets_length, vocab_size]
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                training_decoder)
            decoder_logits = tf.identity(decoder_outputs.rnn_output)

            # The mask gives padded positions a zero weight in the loss.
            mask = tf.cast(tf.sequence_mask(self.targets_length_list),
                           tf.float32)
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=decoder_logits,
                targets=self.decoder_targets,
                weights=mask)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)

            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clip_gradients, _ = tf.clip_by_global_norm(gradients,
                                                       self.clip_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clip_gradients, params))

            # Inference: greedy decoding with the same cell, initial state
            # and projection layer as training.
            start_tokens = tf.fill([self.batch_size], go_id)
            end_token = self.word2idx['<eos>']
            greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embedding,
                start_tokens=start_tokens,
                end_token=end_token)

            inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=greedy_helper,
                initial_state=decoder_initial_state,
                output_layer=projection_layer)
            # sample_id: [batch_size, decoded_length]
            inference_decode_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=inference_decoder, maximum_iterations=20)

            self.inference_outputs = inference_decode_outputs.sample_id
Esempio n. 5
0
    def _decoder_init(self):
        """Build the attention decoder plus the training or inference ops.

        Common part: an attention-wrapped RNN cell whose initial cell state
        is the encoder's final state, and a bias-free Dense projection onto
        the decoder vocabulary. For beam-search inference the attention
        memory, the source lengths and the encoder state are tiled
        ``self.beam_width`` times first.

        ``self.mode == 'train'`` sets ``self.cost``, ``self.global_step``,
        ``self.learning_rate`` (replaced by its decayed tensor),
        ``self.trainable_variables``, ``self.grads`` and ``self.update``.

        ``self.mode == 'infer'`` sets ``self.translations``: the sampled
        ids for greedy decoding, the predicted ids of all beams for beam
        search.

        Raises:
            ValueError: on an unknown ``self.optimizer`` (train) or an
                unknown ``self.infer_mode`` (infer).
        """
        # The attention memory is fed batch-major.
        if self.time_major:
            memory = tf.transpose(self.encoder_outputs, [1, 0, 2])
        else:
            memory = self.encoder_outputs

        if self.mode == 'infer' and self.infer_mode == 'beam_search':
            # Beam search decodes beam_width hypotheses per example, so
            # every encoder-side tensor is repeated beam_width times and
            # the decoder state is sized accordingly.
            memory = tf.contrib.seq2seq.tile_batch(memory,
                                                   multiplier=self.beam_width)
            encoder_input_lengths = tf.contrib.seq2seq.tile_batch(
                self.encoder_input_lengths, multiplier=self.beam_width)
            encoder_state = tf.contrib.seq2seq.tile_batch(
                self.encoder_state, multiplier=self.beam_width)
            batch_size = self.batch_size * self.beam_width
        else:
            batch_size = self.batch_size
            encoder_input_lengths = self.encoder_input_lengths
            encoder_state = self.encoder_state

        with tf.name_scope('decoder'):
            cell = create_rnn_cell(self.unit_type, self.num_units,
                                   self.num_layers, self.keep_prob)
            attention_mechanism = attention_mechanism_fn(
                self.attention_type, self.num_units, memory,
                encoder_input_lengths)
            cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism, attention_layer_size=self.num_units)
            # Start from the wrapper's zero state and replace only the
            # wrapped cell state with the encoder's final state.
            init_state = cell.zero_state(
                batch_size, tf.float32).clone(cell_state=encoder_state)
            projection_layer = Dense(self.decoder_vocab_size, use_bias=False)

        if self.mode == 'train':
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                self.decoder_emb_inp,
                self.decoder_input_lengths,
                time_major=True)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell, train_helper, init_state, output_layer=projection_layer)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                train_decoder, output_time_major=True, swap_memory=True)
            logits = outputs.rnn_output

            with tf.name_scope('optimizer'):
                # Masked cross-entropy, summed and divided by batch size.
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_targets, logits=logits)
                self.cost = tf.reduce_sum((loss * self.mask) / self.batch_size)
                tf.summary.scalar('loss', self.cost)

                # Learning-rate decay. The step counter must not be
                # trainable, otherwise it is returned by
                # tf.trainable_variables() and ends up in the gradient list.
                self.global_step = tf.Variable(0, trainable=False)
                self.learning_rate = tf.train.exponential_decay(
                    self.learning_rate,
                    self.global_step,
                    self.decay_steps,
                    self.decay_rate,
                    staircase=True)

                # Gradient clipping by global norm.
                self.trainable_variables = tf.trainable_variables()
                self.grads, _ = tf.clip_by_global_norm(
                    tf.gradients(self.cost, self.trainable_variables),
                    self.max_gradient_norm)

                # Optimizer: adam | sgd
                if self.optimizer == 'adam':
                    opt = tf.train.AdamOptimizer(self.learning_rate)
                elif self.optimizer == 'sgd':
                    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                else:
                    raise ValueError('unkown optimizer %s' % self.optimizer)

                self.update = opt.apply_gradients(
                    zip(self.grads, self.trainable_variables),
                    global_step=self.global_step)

        elif self.mode == 'infer':
            # start_tokens is sized by the un-tiled self.batch_size even
            # for beam search.
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * 1  # 1 is used as the start token id
            end_token = 2  # 2 is used as the end token id
            if self.infer_mode == 'greedy':
                infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=self.decoder_embedding,
                    start_tokens=start_tokens,
                    end_token=end_token)
                infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell,
                    infer_helper,
                    init_state,
                    output_layer=projection_layer)
            elif self.infer_mode == 'beam_search':
                infer_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=cell,
                    embedding=self.decoder_embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=init_state,
                    beam_width=self.beam_width,
                    output_layer=projection_layer)
            else:
                raise ValueError('unkown infer mode %s' % self.infer_mode)

            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer_decoder, maximum_iterations=50)

            if self.infer_mode == 'greedy':
                self.translations = decoder_outputs.sample_id
            else:
                # Only 'beam_search' can reach this point. Its final output
                # exposes predicted_ids rather than sample_id.
                self.translations = decoder_outputs.predicted_ids