예제 #1
0
 def call(self, inputs):
     """Look up position embeddings for the position ids computed from inputs."""
     return K.gather(self.embeddings, self.compute_position_ids(inputs))
               
# Load the pickled training examples and build train/validation generators.
train_path=PATH+'generate/pkl/train_all.pkl'
train_file=pickle.load(open(train_path,'rb'))
# train_len=int(0.1*len(train_file))
# train_file=train_file[:train_len]
np.random.shuffle(train_file)
# 80% of the shuffled data is used for training, the rest for validation.
val_len=int(0.8*len(train_file))
# NOTE(review): valid_data comes from the *last 50000* examples, which only
# matches train_file[val_len:] for particular dataset sizes — confirm the
# intended split.
valid_data=get_valid(train_file[-50000:])
# print(valid_data[:10])
train_generator = data_generator(train_file[:val_len], batch_size=batch_size)
valid_generator=data_generator(train_file[val_len:],batch_size=batch_size)



model,Seq_crf,Tag_crf = build_model(embeddings=200,vocab_size=vocab_size,rnn_units=300)
# Evaluate the CRF transition matrices once so the recognizers can decode.
Seq_ner = NamedEntityRecognizer(trans=K.eval(Seq_crf.trans))
Tag_ner = NamedEntityRecognizer(trans=K.eval(Tag_crf.trans))


evaluator = Evaluator(valid_data,model,Seq_ner,Tag_ner)
early_stopping = EarlyStopping(monitor='val_tag_crf_Sparse_accuracy', patience=10)  # early stopping to avoid overfitting (mode left as 'auto' — inferred from the metric name)
plateau = ReduceLROnPlateau(monitor='val_tag_crf_Sparse_accuracy', verbose=1, mode='max', factor=0.5, patience=3)  # halve the learning rate when the metric stops improving
# checkpoint = ModelCheckpoint('./model/best_0105.hdf5', monitor='val_tag_crf_Sparse_accuracy', verbose=2, save_best_only=True, mode='max',
#                                      save_weights_only=True)  # save the best model


model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        validation_data=valid_generator.forfit(),
# Load the pre-trained model (return_keras_model=False keeps the bert4keras
# wrapper so .initializer and .model are reachable below).
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
)

# Predict subject start/end probabilities per token (2 channels).
output = Dense(units=2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(bert.model.output)
# Squaring pushes sigmoid outputs toward 0 — presumably a confidence
# sharpening trick; confirm against the training recipe.
subject_preds = Lambda(lambda x: x**2)(output)

mask = bert.model.get_layer('Embedding-Token').output_mask
mask = K.cast(mask, K.floatx())

# subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
# subject_loss = K.mean(subject_loss, 2)
# subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

subject_model = Model(bert.model.inputs, subject_preds)

subject_model.compile(
    # loss = subject_loss,
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate),
    # metrics=['accuracy']
)
# subject_model.load_weights('best_model.weights')
 def compute_loss(self, inputs, mask=None):
     """Joint loss for subject and object extraction.

     inputs: [subject_labels, object_labels, subject_preds, object_preds, features]
     mask:   per-input Keras masks; only mask[4] (the features' mask) is
             used to exclude padded token positions from the loss.
     """
     subject_labels, object_labels = inputs[:2]
     subject_preds, object_preds, _ = inputs[2:]
     # NOTE(review): assumes `mask` is a list aligned with `inputs`; a bare
     # mask=None would raise TypeError on mask[4] — confirm Keras always
     # passes a list here.
     if mask[4] is None:
         mask = 1.0
     else:
         mask = K.cast(mask[4], K.floatx())
     # Subject loss: mean over the start/end channels, masked mean over tokens.
     subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
     subject_loss = K.mean(subject_loss, 2)
     subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
     # Object loss: mean over start/end, summed over predicates, masked mean.
     object_loss = K.binary_crossentropy(object_labels, object_preds)
     object_loss = K.sum(K.mean(object_loss, 3), 2)
     object_loss = K.sum(object_loss * mask) / K.sum(mask)
     # Total loss.
     return subject_loss + object_loss
예제 #5
0

model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only tokens in keep_tokens to shrink the vocabulary
)

model.summary()

# Cross-entropy loss, masking out predictions over the input segment.
y_true = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y_pred = model.output[:, :-1]  # predicted tokens, shifted one step from the targets
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
예제 #6
0
 def new_update(x, new_x):
     """Apply the update only while `cond` holds; otherwise keep `x` as is."""
     return old_update(x, K.switch(cond, new_x, x))
예제 #7
0
 def reset_old_weights(self):
     """Restore the model back to the stored old weights."""
     weight_pairs = zip(self.model_weights, self.old_weights)
     K.batch_set_value(weight_pairs)
예제 #8
0
 def dense_loss(self, y_true, y_pred):
     """CRF loss for one-hot targets: reduce to ids, then reuse sparse_loss."""
     sparse_targets = K.argmax(y_true, 2)
     return self.sparse_loss(sparse_targets, y_pred)
예제 #9
0
 def basic_accuracy(self, y_true, y_pred, go_backwards=False):
     """Per-token accuracy shown during training, excluding masked positions.
     Here y_true must be integer ids (not one-hot).
     """
     # Derive the mask (padded positions hold large negative scores in every
     # class channel) and cast it to float.
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # Re-pin the shape and dtype of y_true.
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Sequence-reversal handling.
     if self.hidden_dim is None:
         if go_backwards:  # whether to reverse the sequences
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         histoty = K.gather(trans, y_true)
     else:
         if go_backwards:  # whether to reverse the sequences
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             # Note the deliberate swap of left/right factors for the
             # reversed direction.
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         histoty = K.gather(l_trans, y_true)
         histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans)
     # Per-label accuracy: average emission scores with the shifted
     # transition scores ("histoty"), then take the argmax.
     histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1)
     y_pred = (y_pred + histoty) / 2
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
예제 #10
0
 def reverse_sequence(self, inputs, mask=None):
     """Reverse each tensor along the time axis.

     Without a mask the whole axis is flipped; with a mask only the valid
     (unpadded) prefix of each sequence is reversed.
     """
     if mask is None:
         return [tensor[:, ::-1] for tensor in inputs]
     lengths = K.cast(K.sum(mask, 1), 'int32')
     return [
         tf.reverse_sequence(tensor, lengths, seq_axis=1)
         for tensor in inputs
     ]
예제 #11
0
 def basic_loss(self, y_true, y_pred, go_backwards=False):
     """Token-level cross-entropy loss; y_true must be integer ids
     (not one-hot).
     """
     # Derive the mask (padded positions hold large negative scores in every
     # class channel) and cast it to float.
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # Re-pin the shape and dtype of y_true.
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Sequence-reversal handling.
     if self.hidden_dim is None:
         if go_backwards:  # whether to reverse the sequences
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         histoty = K.gather(trans, y_true)
     else:
         if go_backwards:  # whether to reverse the sequences
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             # Note the deliberate swap of left/right factors for the
             # reversed direction.
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         histoty = K.gather(l_trans, y_true)
         histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans)
     # Loss: average emission scores with the shifted transition scores
     # ("histoty"), then take masked sparse cross-entropy on the logits.
     histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1)
     y_pred = (y_pred + histoty) / 2
     loss = K.sparse_categorical_crossentropy(y_true,
                                              y_pred,
                                              from_logits=True)
     return K.sum(loss * mask) / K.sum(mask)
예제 #12
0
    def call(self, inputs, mask=None):
        """Zero out padded positions of `inputs` along axis 1."""
        float_mask = None if mask is None else K.cast(mask, K.floatx())
        return sequence_masking(inputs, float_mask, 1, 1)
예제 #13
0
 def sparse_accuracy(self, y_true, y_pred):
     """Per-token accuracy shown during training, excluding masked
     positions. Here y_true must be integer ids (not one-hot).
     """
     # Recover the mask: padded positions carry a large negative value in
     # every class channel.
     valid = K.cast(K.all(K.greater(y_pred, -1e6), axis=2), K.floatx())
     # Pin down the targets' shape and dtype.
     targets = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')
     # Greedy per-token argmax as a rough training-time metric.
     predictions = K.cast(K.argmax(y_pred, 2), 'int32')
     correct = K.cast(K.equal(targets, predictions), K.floatx())
     return K.sum(correct * valid) / K.sum(valid)
예제 #14
0
 def compute_position_ids(self, inputs):
     """T5 relative-position bucketing (translated directly from the
     official T5 source).
     """
     q, v = inputs
     # Pairwise position differences between value and query positions.
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # Post-processing: map the differences into buckets.
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         # Half the buckets per direction; offset the negative side.
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     # Small distances get exact buckets; larger ones share buckets
     # logarithmically up to max_distance.
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
예제 #15
0
 def _resource_apply_sparse(self, grad, var, indices):
     """Densify a sparse gradient, then delegate to the dense update path."""
     dense_grad = tf.convert_to_tensor(
         tf.IndexedSlices(grad, indices, K.shape(var)))
     return self._resource_apply_dense(dense_grad, var)
예제 #16
0
 def dense_accuracy(self, y_true, y_pred):
     """Per-token training accuracy for one-hot targets, excluding masked
     positions: reduce targets to ids and reuse sparse_accuracy."""
     label_ids = K.argmax(y_true, 2)
     return self.sparse_accuracy(label_ids, y_pred)
예제 #17
0
 def _decayed_lr(self, var_dtype):
     """Base decayed learning rate scaled by the piecewise-linear schedule."""
     base_lr = super(NewOptimizer, self)._decayed_lr(var_dtype)
     multiplier = piecewise_linear(self.iterations, self.lr_schedule)
     return base_lr * K.cast(multiplier, var_dtype)
예제 #18
0
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[str((label - 1) // 2)]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


# Decoder over the evaluated CRF transition matrix; starts/ends force label
# 0 at the sequence boundaries (presumably the 'O' tag — confirm).
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Entity-level evaluation.

    Each element of `data` is a sequence of (char, tag) pairs; gold
    entities are the non-'O' items. Returns (f1, precision, recall).
    """
    hits = pred_count = gold_count = 1e-10
    for sample in tqdm(data):
        text = ''.join(char for char, _ in sample)
        predicted = set(NER.recognize(text))
        gold = set(tuple(item) for item in sample if item[1] != 'O')
        hits += len(predicted & gold)
        pred_count += len(predicted)
        gold_count += len(gold)
    f1 = 2 * hits / (pred_count + gold_count)
    return f1, hits / pred_count, hits / gold_count
예제 #19
0
 def new_update(x, new_x):
     """Lazy update: for parameters flagged for lazy optimization, only
     rows whose gradient is non-zero are updated; others pass through."""
     if is_one_of(x, params) and self._do_lazy_optimization(x):
         g = self.grads[x]
         active_rows = K.cast(
             K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True), K.floatx())
         new_x = x + (new_x - x) * active_rows
     return old_update(x, new_x)
예제 #20
0
output = Dense(units=len(predicate2id) * 2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(output)
# Fourth power pushes sigmoid outputs toward 0 — presumably a confidence
# sharpening trick; confirm against the training recipe.
output = Lambda(lambda x: x**4)(output)
object_preds = Reshape((-1, len(predicate2id), 2))(output)

object_model = Model(bert.model.inputs + [subject_ids], object_preds)

# Training model: labels are extra inputs so the loss can be attached via
# add_loss below.
train_model = Model(
    bert.model.inputs + [subject_labels, subject_ids, object_labels],
    [subject_preds, object_preds])
train_model.summary()

mask = bert.model.get_layer('Embedding-Token').output_mask
mask = K.cast(mask, K.floatx())

# Subject loss: mean over the start/end channels, masked mean over tokens.
subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
subject_loss = K.mean(subject_loss, 2)
subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

# Object loss: mean over start/end, summed over predicates, masked mean.
object_loss = K.binary_crossentropy(object_labels, object_preds)
object_loss = K.sum(K.mean(object_loss, 3), 2)
object_loss = K.sum(object_loss * mask) / K.sum(mask)

train_model.add_loss(subject_loss + object_loss)

optimizer = Adam(learning_rate)
train_model.compile(optimizer=optimizer)

from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
# from bert4keras.snippets import open, groupby
from keras.layers import Input, Dense, Lambda, Reshape
from keras.models import Model
from tqdm import tqdm
import os
import tensorflow as tf

# 设置gpu
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # 使用编号为1,2号的GPU
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.98
session = tf.Session(config=config)
K.set_session(session)

# 基本信息
maxlen = 320
epochs = 20
batch_size = 16
learning_rate = 2e-5

# bert配置
path = "../bert/"
# path = "data/"
config_path = path + 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = path + 'chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = path + 'chinese_L-12_H-768_A-12/vocab.txt'

예제 #22
0
 def beta2(self):
     """Second-moment decay rate. When unset, fall back to the
     step-dependent schedule 1 - t^(-0.8)."""
     if self._beta2 is not None:
         return self._beta2
     step = K.cast(self.iterations + 1, K.floatx())
     return 1.0 - K.pow(step, -0.8)
예제 #23
0
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall


# Build our own validation set
#dev_data = json.load(open("/home/wq/ner/valid_data.json",encoding="utf-8"))["valid_data"]
#print(dev_data[0])
if __name__ == '__main__':
    import sys
    arg = sys.argv
    # NOTE(review): arg[1] raises IndexError when no CLI argument is given.
    if arg[1] == "train":
        normal_train = True
        cross_train = False
        model, CRF = build_model()
        NER = NamedEntityRecognizer(trans=K.eval(CRF.trans),
                                    starts=[0],
                                    ends=[0])
        train_data = load_data('/home/wq/ner/train/ner.train')
        valid_data = load_data('/home/wq/ner/train/ner.valid')
        dev_data = json.load(
            open("/home/wq/ner/valid_data.json",
                 encoding="utf-8"))["valid_data"]
        evaluator = Evaluator()
        train_generator = data_generator(train_data, batch_size)

        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=epochs,
                            callbacks=[evaluator])
예제 #24
0
 def __init__(self, *args, **kwargs):
     """Initialize the optimizer and create the iteration counter inside
     this class's name scope."""
     super(AdaFactorV1, self).__init__(*args, **kwargs)
     with K.name_scope(self.__class__.__name__):
         self.iterations = K.variable(0, dtype='int64', name='iterations')
예제 #25
0
                batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_tokens=keep_tokens,  # keep only tokens in keep_tokens to shrink the vocabulary
)
model.summary()

# Cross-entropy loss, masking out predictions over the input segment.
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, shifted one step from the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))


class ReadingComprehension(AutoRegressiveDecoder):
    """Generate an answer via beam-search decoding.

    `passages` is a list of passages; the best answer is decided
    automatically across them, and an empty string is returned when
    there is no answer. When mode is 'extractive', the answer must be
    a contiguous span of one of the original passages.
    """
    def __init__(self, start_id, end_id, maxlen, mode='extractive'):
        super(ReadingComprehension, self).__init__(start_id, end_id, maxlen)
        # Decoding mode; only stored here, consumed elsewhere in the class.
        self.mode = mode
예제 #26
0
    def get_updates(self, loss, params):
        """AdaFactor-style updates via the legacy Keras optimizer API.

        Second moments are stored factored (row/column accumulators) when
        self.factored_shape allows it, falling back to a full accumulator
        otherwise.
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.weights = [self.iterations]
        lr = self.learning_rate

        for i, (p, g) in enumerate(zip(params, grads)):
            g2 = K.square(g) + self.epsilon1
            shape, d_type = K.int_shape(p), K.dtype(p)
            factored_shape = self.factored_shape(shape)
            if factored_shape is None:
                # Define the (unfactored) second-moment slot.
                v = K.zeros(shape, dtype=d_type, name='v_' + str(i))
                self.weights.append(v)
                # Define its EMA update.
                v_t = self.beta2 * v + (1.0 - self.beta2) * g2
                self.updates.append(K.update(v, v_t))
            else:
                # Define the factored row/column accumulators.
                shape1, axis1, shape2, axis2 = factored_shape
                vr = K.zeros(shape1, dtype=d_type, name='vr_' + str(i))
                vc = K.zeros(shape2, dtype=d_type, name='vc_' + str(i))
                self.weights.extend([vr, vc])
                # Define their EMA updates.
                vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
                vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
                self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
                # Recompose the full second-moment matrix from the factors.
                v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
            # Raw update direction.
            u = g / K.sqrt(v_t)
            # Update clipping.
            if self.clipping_threshold is not None:
                # NOTE(review): AdaFactor's RMS is sqrt(mean(u^2)); this
                # computes mean(sum(u^2)) with no sqrt — confirm intended.
                u_rms = K.mean(K.sum(K.square(u)))
                d = self.clipping_threshold
                u = u / K.maximum(1.0, u_rms / d)
            # First-moment smoothing.
            if self.beta1 > 0.0:
                # Define the momentum slot.
                m = K.zeros(shape, dtype=d_type, name='m_' + str(i))
                self.weights.append(m)
                # Define its EMA update.
                m_t = self.beta1 * m + (1.0 - self.beta1) * u
                self.updates.append(K.update(m, m_t))
                u = m_t
            # Scale the step by the parameter magnitude.
            if self.multiply_by_parameter_scale:
                u = u * K.maximum(K.mean(K.sum(K.square(p))), self.epsilon2)
            # Apply the parameter update.
            self.updates.append(K.update(p, p - lr * u))

        return self.updates
예제 #27
0
    """
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    y_true = K.one_hot(y_true, K.shape(y_pred)[-1])
    return K.categorical_crossentropy(y_true, y_pred)


o_in = Input(shape=(None, ))
train_model = Model(model.inputs + [o_in], model.outputs + [o_in])

# Cross-entropy loss, masking out predictions over the input segment.
y_true = train_model.input[2][:, 1:]  # target tokens
y_mask = train_model.input[1][:, 1:]
y_pred = train_model.output[0][:, :-1]  # predicted tokens, shifted one step from the targets
cross_entropy = sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

# Gradient penalty on the token embeddings.
# NOTE(review): `.values` assumes the gradient is an IndexedSlices — confirm.
embeddings = search_layer(train_model.output[0], 'Embedding-Token').embeddings
gp = K.sum(K.gradients(cross_entropy, [embeddings])[0].values**2)

train_model.add_loss(cross_entropy + 0.5 * gp)
train_model.compile(optimizer=Adam(1e-5))

# train_model.add_loss(cross_entropy)
# train_model.compile(optimizer=Adam(1e-5))


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
예제 #28
0
 def _resource_apply(self, grad, var, indices=None):
     """AdaFactor update for a single variable (TF2 optimizer API).

     Second moments are stored factored (row/column slots) when the
     variable's shape allows it, otherwise a full slot is used.
     """
     lr = self.learning_rate
     g2 = K.square(grad) + self.epsilon1
     shape = K.int_shape(var)
     factored_shape = self.factored_shape(shape)
     if factored_shape is None:
         v = self.get_slot(var, 'v')
         # EMA update of the full second moment.
         v_t = self.beta2 * v + (1.0 - self.beta2) * g2
         v_t = K.update(v, v_t)
     else:
         shape1, axis1, shape2, axis2 = factored_shape
         vr = self.get_slot(var, 'vr')
         vc = self.get_slot(var, 'vc')
         # EMA updates of the row/column accumulators.
         vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
         vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
         vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t)
         # Recompose the full second-moment matrix from the factors.
         v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
     # Raw update direction.
     u = grad / K.sqrt(v_t)
     # Update clipping.
     if self.clipping_threshold is not None:
         # NOTE(review): AdaFactor's RMS is sqrt(mean(u^2)); this computes
         # mean(sum(u^2)) with no sqrt — confirm intended.
         u_rms = K.mean(K.sum(K.square(u)))
         d = self.clipping_threshold
         u = u / K.maximum(1.0, u_rms / d)
     # First-moment smoothing.
     if self.beta1 > 0.0:
         m = self.get_slot(var, 'm')
         # EMA update of the momentum slot.
         m_t = self.beta1 * m + (1.0 - self.beta1) * u
         u = K.update(m, m_t)
     # Scale the step by the parameter magnitude.
     if self.multiply_by_parameter_scale:
         u = u * K.maximum(K.mean(K.sum(K.square(var))), self.epsilon2)
     # Apply the parameter update.
     return K.update(var, var - lr * u)
예제 #29
0
# Load the pre-trained model (config only; checkpoint_path=None means no
# pre-trained weights are restored here).
bert = build_bert_model(
    max_seq_len=MAX_SEQ_LEN,
    config_path=config_path,
    checkpoint_path=None,
    with_pool=True,
    return_keras_model=False,
)

output = Dropout(rate=0.1)(bert.model.output)
output = Dense(units=1, kernel_initializer=bert.initializer)(output)
model = Model(bert.model.input, output)

output = Lambda(lambda x: x[:, 0], name='Squeeze')(output)
# NOTE(review): the train and predict branches below reuse the layer names
# 'Reshape' and 'Softmax' within the same graph — Keras rejects duplicate
# layer names; confirm this builds as written.
toutput = Lambda(lambda x: K.reshape(x, [-1, NUM_TRAIN_CANDS]),
                 name='Reshape')(output)
tprobs = Softmax(name='Softmax')(toutput)
train_model = Model(bert.model.input, tprobs)

poutput = Lambda(lambda x: K.reshape(x, [-1, NUM_CANDS]),
                 name='Reshape')(output)
pprobs = Softmax(name='Softmax')(poutput)
predict_model = Model(bert.model.input, pprobs)

valid_data = {}
test_data = {}
entities2name = {}

def _json_object_hook(d):
예제 #30
0
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """Multi-head attention.

     q_mask: mask for the input query sequence; mainly zeroes the padding
             part of the output.
     v_mask: mask for the input value sequence; mainly prevents attention
             from reading padding.
     a_mask: mask on the attention matrix; different attention masks give
             different behaviours.
     p_bias: positional bias inside attention; usually selects the kind of
             relative position encoding.
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     if a_mask:
         # The attention mask is passed as an extra tensor input.
         a_mask = inputs[n]
         n += 1
     # Linear projections.
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape to (batch, seq, heads, per-head size).
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention scores.
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Positional-encoding handling.
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention (continued): scale, mask, softmax.
     if self.scaled_dot_product:
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         # Large negative bias effectively zeroes masked positions.
         a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Combine values into the output.
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Zero padded query positions in the output.
     o = sequence_masking(o, q_mask, 0)
     return o