Ejemplo n.º 1
0
        def energy_step(decode_outs, states):  # decode_outs (batch, dim)
            """One K.rnn step of the Bahdanau energy function:
            e_tj = softmax(V * tanh(W * h_j + U * S_t-1 + b)).

            Shape notes below (e.g. 64 encoder steps x 512 hidden) are the
            author's debug values — TODO confirm against the model config.
            """
            decode_outs = _p(decode_outs,
                             "energy_step:decode_outs 算能量函数了.........."
                             )  # author's trace: decode_outs [1,20]
            # Encoder output is (batch, enc_seq_len, enc_hidden).
            en_seq_len, en_hidden = encoder_out_seq.shape[
                1], encoder_out_seq.shape[2]  # 30, 512
            de_hidden = decode_outs.shape[-1]
            # W * h_j : flatten time into the batch axis first.
            reshaped_enc_outputs = K.reshape(
                encoder_out_seq, (-1, en_hidden))  # [b,64,512] => [b*64,512]
            _p(reshaped_enc_outputs, "reshaped_enc_outputs")

            # W_a[512x512]: [b*64,512] => [b*64,512] => [b,64,512]
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                                  (-1, en_seq_len, en_hidden))
            # U * S_t-1 : [b,512] x U_a[512,512] => [b,512] => [b,1,512]
            U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a),
                                      axis=1)  # <= batch_size, 1, latent_dim

            # Broadcasting the [b,1,512] decoder term against [b,64,512]
            # replicates the decoder output across all encoder timesteps.

            # tanh(W*h_j + U*S_t-1 + b): [b,64,512] flattened to [b*64,512]
            reshaped_Ws_plus_Uh = K.tanh(
                K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            # V * tanh(...): [b*64,512] x [512,1] => [b*64,1] => [b,64]
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a),
                            (-1, en_seq_len))
            # Softmax over encoder timesteps -> attention distribution.
            e_i = K.softmax(e_i)
            e_i = _p(e_i, "energy_step:e_i")
            return e_i, [e_i]
Ejemplo n.º 2
0
 def context_step(e, states):  # e (batch, dim): one attention distribution
     """One K.rnn step: softmax-weighted sum of the encoder outputs.

     e is one decoder timestep's attention distribution over the encoder
     timesteps; the returned c_i is the context vector for that step.
     """
     e = _p(e, "context_step:e")
     states = _p(states, "context_step:states")
     # encoder_out_seq (b, enc_seq, hidden) * e expanded to (b, enc_seq, 1):
     # elementwise multiply with broadcasting, then sum over the time axis —
     # i.e. the expectation of the encoder outputs under the attention
     # weights.
     c_i = K.sum(encoder_out_seq * K.expand_dims(e, -1), axis=1)
     c_i = _p(c_i,
              "context_step:c_i,算h的期望,也就是注意力了---------------------\n")
     return c_i, [c_i]
Ejemplo n.º 3
0
def words_accuracy(y_true, y_pred):
    """Whole-word accuracy metric.

    y_true / y_pred: [batch, seq_len, vocabulary_size] tensors (shape
    established by the original author's debug trace, e.g. [2, 29, 3864]).
    A sample counts as correct only when the argmax matches at *every*
    timestep, i.e. the whole predicted word equals the label word.
    Returns a scalar float32 accuracy in [0, 1].
    """
    # Collapse the vocabulary axis to predicted / labeled character ids:
    # [batch, seq_len] int64.
    max_idx_p = tf.argmax(y_pred, axis=2)
    max_idx_l = tf.argmax(y_true, axis=2)
    max_idx_p = _p(max_idx_p, "@@@,预测值")
    max_idx_l = _p(max_idx_l, "@@@,标签值")
    # Per-character match: [batch, seq_len] bool.
    correct_pred = tf.equal(max_idx_p, max_idx_l)
    correct_pred = _p(correct_pred, "@@@,words_accuracy(字对字)")
    # Per-word match: every character of a sample must match. A single
    # reduce_all over the time axis replaces the original
    # tf.map_fn(lambda e: tf.reduce_all(e), ..., dtype=tf.bool) — same
    # result for this rank-2 input, but one fused op instead of a
    # batch-sized map, and it avoids tf.map_fn's deprecated `dtype=` arg.
    _result = tf.reduce_all(correct_pred, axis=1)
    _result = _p(_result, "@@@,words_accuracy(词对词)")
    # Fraction of fully-correct words in the batch.
    result = tf.reduce_mean(tf.cast(_result, tf.float32))
    result = _p(result, "@@@,words_accuracy正确率")
    return result
Ejemplo n.º 4
0
def train_model(conf, args):
    """Build and compile the attention-based seq2seq training model.

    conf: config object (GRU_HIDDEN_SIZE, CHARSET_SIZE, ...).
    args: runtime arguments (learning_rate).
    Returns the compiled Keras model taking [input_image, decoder_inputs]
    and producing per-timestep charset probabilities.
    """
    # CNN feature extractor: image -> feature sequence.
    conv, input_image = Conv().build()

    # Encoder: one bidirectional GRU over the conv feature sequence.
    # TODO: stacking two bi-GRUs was attempted and shelved; revisit later.
    encoder_bi_gru = Bidirectional(
        GRU(conf.GRU_HIDDEN_SIZE,
            return_sequences=True,
            return_state=True,
            name='encoder_gru'),
        name='bidirectional_encoder')

    encoder_out, encoder_fwd_state, encoder_back_state = encoder_bi_gru(conv)
    encoder_fwd_state = _p(encoder_fwd_state,
                           "编码器输出Fwd状态%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    encoder_back_state = _p(encoder_back_state,
                            "编码器输出Back状态%%%%%%%%%%%%%%%%%%%%%%%%%%%")

    # Decoder: teacher-forcing input of per-character one-hot vectors.
    decoder_inputs = Input(shape=(None, conf.CHARSET_SIZE),
                           name='decoder_inputs')
    # Decoder hidden size is twice the encoder's so the concatenated
    # fwd/back encoder states can seed the decoder's initial state.
    decoder_gru = GRU(units=conf.GRU_HIDDEN_SIZE * 2,
                      return_sequences=True,
                      return_state=True,
                      name='decoder_gru')
    decoder_initial_status = Concatenate(axis=-1)(
        [encoder_fwd_state, encoder_back_state])
    decoder_out, decoder_state = decoder_gru(
        decoder_inputs, initial_state=decoder_initial_status)

    # Attention over the encoder outputs, driven by the decoder outputs.
    attn_layer = AttentionLayer(name='attention_layer')
    logger.debug("模型Attention调用的张量[encoder_out, decoder_out]:%r,%r",
                 encoder_out, decoder_out)
    attn_out, attn_states = attn_layer([encoder_out,
                                        decoder_out])  # c_outputs, e_outputs

    # Concatenate decoder output with the attention context, then project
    # each timestep onto the charset with a shared softmax Dense.
    decoder_concat_input = Concatenate(
        axis=-1, name='concat_layer')([decoder_out, attn_out])
    dense = Dense(conf.CHARSET_SIZE,
                  activation='softmax',
                  name='softmax_layer')
    dense_time = TimeDistributed(dense, name='time_distributed_layer')
    decoder_prob = dense_time(decoder_concat_input)

    # `model` rather than `train_model` avoids shadowing this function.
    model = Model(inputs=[input_image, decoder_inputs],
                  outputs=decoder_prob)
    opt = Adam(lr=args.learning_rate)

    # categorical_crossentropy is applied per timestep and averaged by
    # Keras, so it covers the seq2seq case too (cf. the Keras
    # lstm_seq2seq example: https://keras.io/examples/lstm_seq2seq/).
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=[words_accuracy])

    model.summary()

    return model
Ejemplo n.º 5
0
    def call(self, inputs, verbose=False, mask=None):
        """Bahdanau-style attention.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        Returns (c_outputs, e_outputs): per-decoder-step context vectors
        and per-decoder-step attention distributions over encoder steps.
        (Author's debug shapes: e_outputs [3 29 64], c_outputs [3 29 1024].)
        """
        assert type(inputs) == list
        # encoder_out_seq: one output per encoder timestep;
        # decoder_out_seq: one output per decoder timestep.
        encoder_out_seq, decoder_out_seq = inputs

        logger.debug("encoder_out_seq.shape:%r", encoder_out_seq.shape)
        logger.debug("decoder_out_seq.shape:%r", decoder_out_seq.shape)

        encoder_out_seq = _p(encoder_out_seq,
                             "注意力调用:入参编码器输出序列:encoder_out_seq")
        decoder_out_seq = _p(decoder_out_seq,
                             "注意力调用:入参解码器输出序列:decoder_out_seq")

        # Energy function: e_tj = softmax(V * tanh(W * h_j + U * S_t-1 + b)).
        # `decode_outs` is ONE decoder timestep's output (batched), not the
        # whole sequence — K.rnn feeds it one step at a time; every step
        # attends over the full encoder_out_seq.
        def energy_step(decode_outs, states):  # decode_outs (batch, dim)

            # Shape notes (e.g. 64 steps x 512 hidden) are the author's
            # debug values — assumes rank-3 encoder output; TODO confirm.
            en_seq_len, en_hidden = encoder_out_seq.shape[
                1], encoder_out_seq.shape[2]  # 30, 512
            de_hidden = decode_outs.shape[-1]
            # W * h_j : flatten time into the batch axis first.
            reshaped_enc_outputs = K.reshape(
                encoder_out_seq, (-1, en_hidden))  # [b,64,512] => [b*64,512]

            # W_a[512x512]: [b*64,512] => [b*64,512] => [b,64,512]
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                                  (-1, en_seq_len, en_hidden))

            # U * S_t-1 : [b,512] x U_a[512,512] => [b,512] => [b,1,512]
            U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a),
                                      axis=1)  # <= batch_size, 1, latent_dim

            # Broadcasting the [b,1,512] decoder term against [b,64,512]
            # replicates the decoder output across all encoder timesteps.

            # tanh(W*h_j + U*S_t-1 + b): [b,64,512] flattened to [b*64,512]
            reshaped_Ws_plus_Uh = K.tanh(
                K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))

            # V * tanh(...): [b*64,512] x [512,1] => [b*64,1] => [b,64]
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a),
                            (-1, en_seq_len))

            # Softmax over encoder timesteps -> attention distribution.
            e_i = K.softmax(e_i)

            return e_i, [e_i]

        # Context step for K.rnn:
        #   e: one decoder step's attention distribution over encoder
        #      timesteps, (batch, enc_seq_len) — the output of energy_step;
        #   states: previous-state list (only needed for K.rnn plumbing).
        def context_step(e, states):  # e (batch, dim): one attention vector

            # encoder_out_seq (b, enc_seq, hidden) * e expanded to
            # (b, enc_seq, 1): elementwise multiply with broadcasting, then
            # sum over time — the softmax-weighted sum (expectation) of the
            # encoder outputs, i.e. the context vector c_i.
            c_i = K.sum(encoder_out_seq * K.expand_dims(e, -1), axis=1)
            c_i = _p(c_i, "context_step:c_i,算h的期望,也就是注意力了")
            return c_i, [c_i]

        # Build an all-zeros initial state of shape (batch, hidden_size)
        # from a (batch, enc_seq_len, latent_dim) tensor. The state value
        # is never used; K.rnn just requires one with the right shape.
        def create_inital_state(inputs, hidden_size):  # hidden_size=64
            # We are not using initial states, but need to pass something to K.rnn funciton
            fake_state = K.zeros_like(
                inputs)  # (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = tile(
                fake_state,
                [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        # fake_state_e: (batch, enc_seq_len) — one slot per encoder step,
        # matching the shape of each attention distribution e_i.
        fake_state_e = create_inital_state(
            encoder_out_seq, encoder_out_seq.shape[1]
        )  # encoder_out_seq.shape[1]) , fake_state_e (batch,enc_seq_len)
        fake_state_e = _p(fake_state_e, "fake_state_e")

        # K.rnn(step_function, inputs, initial_states): runs energy_step
        # once per decoder timestep; each run attends over the whole
        # encoder sequence and yields one attention distribution.
        last_out, e_outputs, _ = K.rnn(
            step_function=energy_step,
            inputs=decoder_out_seq,
            initial_states=[fake_state_e],  # (batch, enc_seq_len)
        )
        # e_outputs: (batch, dec_seq_len, enc_seq_len)

        e_outputs = _p(e_outputs, "能量函数e输出")

        # fake_state_c: (batch, enc_hidden) — matches each context c_i.
        fake_state_c = create_inital_state(encoder_out_seq,
                                           encoder_out_seq.shape[-1])  #

        # Second K.rnn pass: turn each attention distribution into a
        # weighted-sum context vector over the encoder outputs.
        last_out, c_outputs, _ = K.rnn(
            step_function=context_step,
            inputs=e_outputs,
            initial_states=[fake_state_c],  # (batch, enc_hidden)
        )
        # c_outputs: (batch, dec_seq_len, enc_hidden)
        c_outputs = _p(c_outputs, "注意力c输出Shape")

        # Outputs (author's debug shapes):
        #   e_outputs (attention weights): [3 29 64]
        #   c_outputs (context vectors):   [3 29 1024]
        return c_outputs, e_outputs
Ejemplo n.º 6
0
 def squeeze_wrapper(self, tensor):
     """Debug-print the conv backbone output, then drop axis 1."""
     traced = _p(tensor, "Resnet50卷基层输出%%%%%%%%%%%%%%%%%%%%%%%%%%%")
     return squeeze(traced, axis=1)