def sequence_masking(x, mask, mode=0, axis=None, heads=1):
    """Mask a sequence tensor along its sequence axis.

    mask: 0/1 matrix shaped (batch_size, seq_len);
    mode: 0 -> multiply x by the mask;
          1 -> subtract a large positive number at padded positions
               (useful right before a softmax).
    axis: axis holding the sequence dimension, defaults to 1;
    heads: number of times the batch dimension must be repeated
           (e.g. when heads are folded into the batch axis).
    """
    if mask is None or mode not in [0, 1]:
        return x
    else:
        # BUG FIX: the original used `heads is not 1` (identity test on an
        # int, which only works by CPython interning); use `!=` instead.
        if heads != 1:
            mask = K.expand_dims(mask, 1)
            mask = K.tile(mask, (1, heads, 1))
            mask = K.reshape(mask, (-1, K.shape(mask)[2]))
        if axis is None:
            axis = 1
        if axis == -1:
            axis = K.ndim(x) - 1
        assert axis > 0, 'axis must be greater than 0'
        # Broadcast the mask up to the rank of x: insert singleton dims
        # before and after the sequence axis.
        for _ in range(axis - 1):
            mask = K.expand_dims(mask, 1)
        for _ in range(K.ndim(x) - K.ndim(mask) - axis + 1):
            mask = K.expand_dims(mask, K.ndim(mask))
        if mode == 0:
            return x * mask
        else:
            return x - (1 - mask) * 1e12
def compute_position_ids(self, inputs):
    """T5 relative-position bucketing (translated from the official T5 code).

    All models use 32 embeddings whose value ranges grow logarithmically up
    to a maximum offset of 128; beyond that offset every relative position
    shares a single embedding.  Note that a given layer is insensitive to
    relative positions beyond 128, but later layers can build sensitivity to
    larger offsets by combining local information from earlier layers.
    """
    q, v = inputs
    # Pairwise position differences between value and query indices.
    q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
    q_idxs = K.expand_dims(q_idxs, 1)
    v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
    v_idxs = K.expand_dims(v_idxs, 0)
    pos_ids = v_idxs - q_idxs
    # Post-processing: map each offset to a bucket id.
    num_buckets, max_distance = self.input_dim, self.max_distance
    ret = 0
    n = -pos_ids
    if self.bidirectional:
        # Half the buckets encode the sign of the offset.
        num_buckets //= 2
        ret += K.cast(K.less(n, 0), 'int32') * num_buckets
        n = K.abs(n)
    else:
        n = K.maximum(n, 0)
    # now n is in the range [0, inf)
    # Small offsets get one bucket each; large ones are spaced
    # logarithmically up to max_distance.
    max_exact = num_buckets // 2
    is_small = K.less(n, max_exact)
    val_if_large = max_exact + K.cast(
        K.log(K.cast(n, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    val_if_large = K.minimum(val_if_large, num_buckets - 1)
    ret += K.switch(is_small, n, val_if_large)
    return ret
def compute_position_ids(self, inputs):
    """T5 relative-position bucketing (after the official T5 code)."""
    query, value = inputs
    # Relative offsets between every (query, value) index pair.
    row_ids = K.expand_dims(K.arange(0, K.shape(query)[1], dtype='int32'), 1)
    col_ids = K.expand_dims(K.arange(0, K.shape(value)[1], dtype='int32'), 0)
    relative = col_ids - row_ids
    # Map offsets to bucket ids: exact buckets for small offsets,
    # log-spaced buckets for large ones.
    num_buckets, max_distance = self.input_dim, self.max_distance
    buckets = 0
    distance = -relative
    if self.bidirectional:
        num_buckets //= 2
        buckets += K.cast(K.less(distance, 0), 'int32') * num_buckets
        distance = K.abs(distance)
    else:
        distance = K.maximum(distance, 0)
    # distance is now in [0, inf)
    max_exact = num_buckets // 2
    use_exact = K.less(distance, max_exact)
    log_bucket = max_exact + K.cast(
        K.log(K.cast(distance, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    log_bucket = K.minimum(log_bucket, num_buckets - 1)
    buckets += K.switch(use_exact, distance, log_bucket)
    return buckets
def log_norm_step(self, inputs, states):
    """Recursively compute the CRF normalizing factor (log Z).

    Key points: 1) one recursion per time step; 2) logsumexp avoids
    numerical overflow.
    """
    # The last feature column carries the per-step sequence mask.
    inputs, mask = inputs[:, :-1], inputs[:, -1:]
    states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
    trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
    outputs = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
    outputs = outputs + inputs
    # On masked (padding) steps, carry the previous state forward unchanged.
    outputs = mask * outputs + (1 - mask) * states[:, :, 0]
    return outputs, [outputs]
def compute_position_ids(self, inputs):
    """Clipped relative position ids between query and value sequences."""
    query, value = inputs
    # Pairwise offsets: ids[j, k] = k - j.
    row = K.expand_dims(K.arange(0, K.shape(query)[1], dtype='int32'), 1)
    col = K.expand_dims(K.arange(0, K.shape(value)[1], dtype='int32'), 0)
    ids = col - row
    # Clip to the supported window, then shift into [0, input_dim).
    max_position = (self.input_dim - 1) // 2
    return K.clip(ids, -max_position, max_position) + max_position
def call(self, inputs):
    """Layer normalization.

    In conditional mode `inputs` is a list [x, condition]; beta and gamma
    are then derived from the condition tensor.
    """
    if self.conditional:
        inputs, cond = inputs
        if self.hidden_units is not None:
            cond = self.hidden_dense(cond)
        # Broadcast the condition up to the rank of the inputs.
        while K.ndim(cond) < K.ndim(inputs):
            cond = K.expand_dims(cond, 1)
        if self.center:
            beta = self.beta_dense(cond) + self.beta
        if self.scale:
            gamma = self.gamma_dense(cond) + self.gamma
    else:
        if self.center:
            beta = self.beta
        if self.scale:
            gamma = self.gamma
    # Standard LN: center, rescale by the std, then apply the affine params.
    outputs = inputs
    if self.center:
        outputs = outputs - K.mean(outputs, axis=-1, keepdims=True)
    if self.scale:
        variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
        outputs = outputs / K.sqrt(variance + self.epsilon) * gamma
    if self.center:
        outputs = outputs + beta
    return outputs
def call(self, inputs):
    """(Conditional) Layer Normalization.

    With conditional=True the input is a list whose second element is the
    condition tensor used to produce beta/gamma.
    """
    if self.conditional:
        inputs, cond = inputs
        if self.hidden_units is not None:
            cond = self.hidden_dense(cond)
        for _ in range(K.ndim(inputs) - K.ndim(cond)):
            cond = K.expand_dims(cond, 1)
        # The ternaries only touch beta/gamma weights when they exist.
        beta = self.beta_dense(cond) + self.beta if self.center else None
        gamma = self.gamma_dense(cond) + self.gamma if self.scale else None
    else:
        beta = self.beta if self.center else None
        gamma = self.gamma if self.scale else None
    outputs = inputs
    if self.center:
        outputs = outputs - K.mean(outputs, axis=-1, keepdims=True)
    if self.scale:
        variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
        outputs = outputs / K.sqrt(variance + self.epsilon)
        outputs = outputs * gamma
    if self.center:
        outputs = outputs + beta
    return outputs
def compute_mask(self, inputs, mask=None):
    """In conditional mode, AND together all non-None input masks;
    otherwise pass the mask through unchanged."""
    if not self.conditional:
        return mask
    stacked = [K.expand_dims(m, 0) for m in mask if m is not None]
    if not stacked:
        return None
    return K.all(K.concatenate(stacked, axis=0), axis=0)
def call(self, inputs):
    """Add (or concatenate) learned position embeddings to the inputs."""
    shape = K.shape(inputs)
    batch_size, seq_len = shape[0], shape[1]
    # Take the first seq_len rows and repeat them across the batch.
    pos = K.tile(
        K.expand_dims(self.embeddings[:seq_len], 0),
        [batch_size, 1, 1],
    )
    if self.merge_mode == 'add':
        return inputs + pos
    return K.concatenate([inputs, pos])
def call(self, inputs, mask=None, a_mask=None, p_bias=None):
    """Multi-head attention.

    q_mask: mask of the query sequence; zeroes the padding part of the
        output.
    v_mask: mask of the value sequence; stops attention from reading
        padding information.
    a_mask: mask applied to the attention matrix; different masks implement
        different applications.
    p_bias: positional bias inside attention; selects the kind of relative
        position encoding.
    """
    q, k, v = inputs[:3]
    # n tracks the index of the next optional extra input.
    q_mask, v_mask, n = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask:
        a_mask = inputs[n]
        n += 1
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention scores
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encodings
    if p_bias == 'typical_relative':
        pos_embeddings = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    elif p_bias == 't5_relative':
        pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(pos_embeddings, 0)
    # Attention (continued): scale, mask, softmax
    if self.attention_scale:
        a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Weighted sum of values (+ relative-position contribution)
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads))
    o = self.o_dense(o)
    # Zero out padding positions of the output
    o = sequence_masking(o, q_mask, 0)
    return o
def call(self, inputs):
    """Add (or concatenate) position embeddings, optionally offset by a
    padding index (fairseq/RoBERTa-style convention).
    """
    input_shape = K.shape(inputs)
    batch_size, seq_len = input_shape[0], input_shape[1]
    if self.offset_positions_by_padding and self.padding_idx:
        # Positions start right after the padding index.
        pos_embeddings = self.embeddings[self.padding_idx + 1:seq_len + self.padding_idx + 1]
    else:
        # BUG FIX: the original read `self.embeddings[self.padding_idx]`,
        # selecting a single rank-1 row that cannot be tiled with a
        # 3-element multiple; slice the first seq_len rows instead, matching
        # the sibling PositionEmbedding implementations.
        pos_embeddings = self.embeddings[:seq_len]
    pos_embeddings = K.expand_dims(pos_embeddings, 0)
    pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
    if self.merge_mode == 'add':
        return inputs + pos_embeddings
    else:
        return K.concatenate([inputs, pos_embeddings])
def call(self, inputs):
    """Merge learned position embeddings into the inputs.

    In 'add' mode the embeddings broadcast over the batch axis, so no
    tiling is needed; concatenation requires an explicit tile.
    """
    shape = K.shape(inputs)  # inputs: (batch, seq_len, dim)
    batch_size, seq_len = shape[0], shape[1]
    pos = K.expand_dims(self.embeddings[:seq_len], 0)  # (1, seq_len, emb_dim)
    if self.merge_mode == 'add':
        return inputs + pos  # broadcast add over the batch
    return K.concatenate([inputs, K.tile(pos, [batch_size, 1, 1])])
def pay_attention_to(self, inputs, mask=None, **kwargs):
    """Standard multiplicative multi-head attention.

    a_bias: bias applied to the attention matrix; different biases
        implement different applications.
    p_bias: positional bias inside attention; selects the kind of relative
        position encoding.
    Note: pay_attention_to is split out as a separate function so that
    subclasses can define other forms of attention; it must return
    o.shape = (batch_size, seq_len, heads, head_size).
    """
    # n tracks the index of the next optional extra input.
    (qw, kw, vw), n = inputs[:3], 3
    q_mask, v_mask = mask
    a_bias, p_bias = kwargs.get('a_bias'), kwargs.get('p_bias')
    if a_bias:
        a_bias = inputs[n]
        n += 1
    if p_bias == 'rotary':
        # RoPE: rotate (even, odd) feature pairs of q and k by the
        # position-dependent angle carried in inputs[n].
        cos_pos = K.repeat_elements(inputs[n][..., None, 1::2], 2, -1)
        sin_pos = K.repeat_elements(inputs[n][..., None, ::2], 2, -1)
        qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4)
        qw2 = K.reshape(qw2, K.shape(qw))
        qw = qw * cos_pos + qw2 * sin_pos
        kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4)
        kw2 = K.reshape(kw2, K.shape(kw))
        kw = kw * cos_pos + kw2 * sin_pos
    # Attention scores
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encodings
    if p_bias == 'typical_relative':
        position_bias = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, position_bias)
    elif p_bias == 't5_relative':
        position_bias = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(position_bias, 0)
    # Attention (continued): scale, bias, mask, softmax, dropout
    if self.attention_scale:
        a = a / self.key_size**0.5
    if a_bias is not None:
        a = a + a_bias
    a = sequence_masking(a, v_mask, '-inf', -1)
    A = K.softmax(a)
    if self.attention_dropout:
        A = Dropout(self.attention_dropout)(A)
    # Weighted sum of values; with typical_relative encoding the
    # position embeddings also contribute to the output.
    o = tf.einsum('bhjk,bkhd->bjhd', A, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', A, position_bias)
    return o, a
def call(self, inputs):
    """If `inputs` is a list, its second element supplies explicit position
    ids; otherwise sequential ids [0, 1, 2, ...] are used.
    """
    if isinstance(inputs, list):
        inputs, pos_ids = inputs
        pos_embeddings = K.gather(self.embeddings, pos_ids)
    else:
        shape = K.shape(inputs)
        batch_size, seq_len = shape[0], shape[1]
        pos_embeddings = K.tile(
            K.expand_dims(self.embeddings[:seq_len], 0),
            [batch_size, 1, 1],
        )
    if self.merge_mode == 'add':
        return inputs + pos_embeddings
    return K.concatenate([inputs, pos_embeddings])
def dense_loss(self, y_true, y_pred):
    """CRF loss with dense targets: y_true must be one-hot.

    Returns -log p(y_true | x) = log Z - target path score.
    """
    mask = self.output_mask
    # Score of the target label path.
    target_score = self.target_score(y_true, y_pred, mask)
    # Recursively compute log Z; the mask is appended as an extra feature
    # column so log_norm_step can read it per time step.
    init_states = [y_pred[:, 0]]
    if mask is None:
        mask = K.ones_like(y_pred[:, :, :1])
    else:
        mask = K.expand_dims(mask, 2)
    y_pred = K.concatenate([y_pred, mask])
    log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states)  # log Z vector at the final step
    log_norm = tf.reduce_logsumexp(log_norm, 1)  # logsumexp -> one scalar per sample
    # Loss -log p
    return log_norm - target_score
def call(self, inputs):
    """With custom_position_ids, the second input holds explicit position ids."""
    if self.custom_position_ids:
        inputs, position_ids = inputs
        if K.dtype(position_ids) != 'int32':
            position_ids = K.cast(position_ids, 'int32')
        pos_embeddings = K.gather(self.embeddings, position_ids)
    else:
        shape = K.shape(inputs)
        batch_size, seq_len = shape[0], shape[1]
        pos_embeddings = K.expand_dims(self.embeddings[:seq_len], 0)
        # Broadcasting covers 'add'; only concatenation needs a real tile.
        if self.merge_mode != 'add':
            pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
    if self.merge_mode == 'add':
        return inputs + pos_embeddings
    return K.concatenate([inputs, pos_embeddings])
def call(self, inputs):
    """(Conditional) Layer Normalization.

    In conditional mode the input is [x, condition] and beta/gamma are
    derived from the condition tensor.
    """
    if self.conditional:
        inputs, cond = inputs
        if self.hidden_units is not None:
            cond = self.hidden_dense(cond)
        while K.ndim(cond) < K.ndim(inputs):
            cond = K.expand_dims(cond, 1)
        cond_beta = self.beta_dense(cond)
        cond_gamma = self.gamma_dense(cond)
        beta, gamma = self.beta + cond_beta, self.gamma + cond_gamma
    else:
        beta, gamma = self.beta, self.gamma
    # Normalize over the feature axis, then apply the affine transform.
    mean = K.mean(inputs, axis=-1, keepdims=True)
    variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True)
    normalized = (inputs - mean) / K.sqrt(variance + self.epsilon)
    return normalized * gamma + beta
def seq_gather(x: list):
    """Gather one vector per sequence from a batch.

    seq has shape [batch_size, seq_len, vector_size] and idxs has shape
    [batch_size, 1]; for each sequence i the idxs[i]-th vector is selected,
    producing a [batch_size, vector_size] tensor.

    :param x: [seq, idxs] — the sequence tensor and the per-sample indices
    :return: the gathered vectors
    """
    # NOTE: a dead hard-coded example list (`idx = [[4],[9],...]`) was
    # removed; it was never used.
    seq, idxs = x
    # gather_nd needs integer indices.
    idxs = K.cast(idxs, 'int32')
    # [0, 1, ..., batch_size - 1]: one row index per sample.
    batch_idxs = K.arange(0, K.shape(seq)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)  # e.g. [[0], [1], [2], ...]
    # Pair each batch index with its position: [[0, i0], [1, i1], ...]
    idxs = K.concatenate([batch_idxs, idxs], 1)
    # Pick seq[b, idxs[b]] for every b in the batch.
    return tf.gather_nd(seq, idxs)
def pay_attention_to(self, inputs, mask=None, **kwargs):
    """Standard multiplicative multi-head attention.

    a_mask: mask applied to the attention matrix; different masks implement
        different applications.
    p_bias: positional bias inside attention; selects the kind of relative
        position encoding.
    Note: pay_attention_to is split out as a separate function so that
    subclasses can define other forms of attention; it must return
    o.shape = (batch_size, seq_len, heads, head_size).
    """
    # n tracks the index of the next optional extra input.
    (qw, kw, vw), n = inputs[:3], 3
    q_mask, v_mask = mask
    a_mask, p_bias = kwargs.get('a_mask'), kwargs.get('p_bias')
    if a_mask:
        a_mask = inputs[n]
        n += 1
    # Attention scores
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encodings
    if p_bias == 'typical_relative':
        pos_embeddings = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    elif p_bias == 't5_relative':
        pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(pos_embeddings, 0)
    # Attention (continued): scale, mask, softmax
    if self.attention_scale:
        a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Weighted sum of values (+ relative-position contribution)
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    return o
# 加载预训练模型 bert_model = build_transformer_model( config_path=config_path, checkpoint_path=checkpoint_path, return_keras_model=False, ) q_x_in = Input(shape=(None, ), name='Input-Token-query') # 待识别句子输入 q_s_in = Input(shape=(None, ), name='Input-Segment-query') # 待识别句子输入 q_start_in = Input(shape=(None, ), name='Input-Start-query') # 实体左边界(标签) q_end_in = Input(shape=(None, ), name='Input-End-query') # 实体右边界(标签) q_label_in = Input(shape=(None, ), name='Input-label-query') # 实体右边界(标签) # tokens, segments, q_st, q_en, q_label = q_x_in, q_s_in, q_start_in, q_end_in, q_label_in x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'), name='x_mask')(bert_model.model.inputs[0]) # 预测category x = Lambda(lambda x: x[:, 0], name='CLS_Token')(bert_model.model.output) out1 = Dropout(0.5, name='out1')(x) ps_category = Dense(units=len(classes), activation='sigmoid', name='ps_category')(out1) # 利用ps_category的信息 output = bert_model.model.layers[-2].get_output_at(-1) ps_heads = Dense(1, activation='sigmoid', use_bias=False, name='dps1')(bert_model.model.output) ps_heads = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10,
def build_model():
    """Build the event-extraction models.

    A trigger model is built first; the remaining argument models (object,
    subject, location, time, negation) are conditioned on the trigger span.

    :return: (trigger_model, subject_model, object_model, time_model,
              loc_model, negative_model, train_model)
    """
    with SESS.as_default():
        with SESS.graph.as_default():
            # BERT backbone
            bert_model = build_transformer_model(
                config_path=bert_config.config_path,
                checkpoint_path=bert_config.checkpoint_path,
                return_keras_model=False,
                model=bert_config.model_type)
            # `l` is an internal layer name (str); unfreeze every layer
            for l in bert_model.layers:
                bert_model.model.get_layer(l).trainable = True
            # Label inputs — Keras adds the batch dimension automatically.
            # Trigger labels (batch_size, seq_len)
            trigger_start_in = Input(shape=(None, ))
            trigger_end_in = Input(shape=(None, ))
            # Trigger index (batch_size, 1)
            trigger_index_start_in = Input(shape=(1, ))
            trigger_index_end_in = Input(shape=(1, ))
            # Object labels (batch_size, seq_len)
            object_start_in = Input(shape=(None, ))
            object_end_in = Input(shape=(None, ))
            # Subject labels (batch_size, seq_len)
            subject_start_in = Input(shape=(None, ))
            subject_end_in = Input(shape=(None, ))
            # Location labels (batch_size, seq_len)
            loc_start_in = Input(shape=(None, ))
            loc_end_in = Input(shape=(None, ))
            # Time labels (batch_size, seq_len)
            time_start_in = Input(shape=(None, ))
            time_end_in = Input(shape=(None, ))
            # Negation labels (batch_size, seq_len)
            negative_start_in = Input(shape=(None, ))
            negative_end_in = Input(shape=(None, ))
            # Aliases for the placeholders (convenience only, no structural
            # meaning in the model).
            trigger_start, trigger_end = trigger_start_in, trigger_end_in
            trigger_index_start, trigger_index_end = trigger_index_start_in, trigger_index_end_in
            object_start, object_end = object_start_in, object_end_in
            subject_start, subject_end = subject_start_in, subject_end_in
            loc_start, loc_end = loc_start_in, loc_end_in
            time_start, time_end = time_start_in, time_end_in
            negative_start, negative_end = negative_start_in, negative_end_in
            # bert_model.model.inputs is [token_ids, segment_ids], each
            # (batch, seq_len).  Build a padding mask from the token ids:
            # expand to (batch, seq_len, 1) and mark non-zero tokens with 1,
            # padding with 0, so padded positions never contribute to the
            # loss / backprop.
            mask = Lambda(lambda x: K.cast(
                K.greater(K.expand_dims(x[0], 2), 0), 'float32'))(
                    bert_model.model.inputs)
            # Trigger start/end heads on top of the BERT output
            # (batch_size, seq_len, 768).
            trigger_start_out = Dense(1, activation='sigmoid')(
                bert_model.model.output)
            trigger_end_out = Dense(1, activation='sigmoid')(
                bert_model.model.output)
            # Model that predicts the trigger span.
            trigger_model = Model(bert_model.model.inputs,
                                  [trigger_start_out, trigger_end_out])
            # Extract the token vectors at the trigger span boundaries and
            # average them.
            k1v = Lambda(seq_gather)(
                [bert_model.model.output, trigger_index_start])
            k2v = Lambda(seq_gather)(
                [bert_model.model.output, trigger_index_end])
            kv = Average()([k1v, k2v])
            # Sentence representation conditioned on the trigger vector,
            # used to predict all other arguments.
            t = LayerNormalization(conditional=True)(
                [bert_model.model.output, kv])
            # Argument heads
            object_start_out = Dense(1, activation='sigmoid')(t)
            object_end_out = Dense(1, activation='sigmoid')(t)
            subject_start_out = Dense(1, activation='sigmoid')(t)
            subject_end_out = Dense(1, activation='sigmoid')(t)
            loc_start_out = Dense(1, activation='sigmoid')(t)
            loc_end_out = Dense(1, activation='sigmoid')(t)
            time_start_out = Dense(1, activation='sigmoid')(t)
            time_end_out = Dense(1, activation='sigmoid')(t)
            negative_start_out = Dense(1, activation='sigmoid')(t)
            negative_end_out = Dense(1, activation='sigmoid')(t)
            # Sub-models: text + trigger span -> argument span
            object_model = Model(
                bert_model.model.inputs +
                [trigger_index_start_in, trigger_index_end_in],
                [object_start_out, object_end_out])
            subject_model = Model(
                bert_model.model.inputs +
                [trigger_index_start_in, trigger_index_end_in],
                [subject_start_out, subject_end_out])
            loc_model = Model(
                bert_model.model.inputs +
                [trigger_index_start_in, trigger_index_end_in],
                [loc_start_out, loc_end_out])
            time_model = Model(
                bert_model.model.inputs +
                [trigger_index_start_in, trigger_index_end_in],
                [time_start_out, time_end_out])
            negative_model = Model(
                bert_model.model.inputs +
                [trigger_index_start_in, trigger_index_end_in],
                [negative_start_out, negative_end_out])
            # Joint training model
            train_model = Model(
                bert_model.model.inputs + [
                    trigger_start_in, trigger_end_in, trigger_index_start_in,
                    trigger_index_end_in, object_start_in, object_end_in,
                    subject_start_in, subject_end_in, loc_start_in,
                    loc_end_in, time_start_in, time_end_in,
                    negative_start_in, negative_end_in
                ], [
                    trigger_start_out, trigger_end_out, object_start_out,
                    object_end_out, subject_start_out, subject_end_out,
                    loc_start_out, loc_end_out, time_start_out, time_end_out,
                    negative_start_out, negative_end_out
                ])
            # Expand the labels to (batch, seq_len, 1) to align with `mask`
            # and the prediction heads.
            trigger_start = K.expand_dims(trigger_start, 2)
            trigger_end = K.expand_dims(trigger_end, 2)
            object_start = K.expand_dims(object_start, 2)
            object_end = K.expand_dims(object_end, 2)
            subject_start = K.expand_dims(subject_start, 2)
            subject_end = K.expand_dims(subject_end, 2)
            loc_start = K.expand_dims(loc_start, 2)
            loc_end = K.expand_dims(loc_end, 2)
            time_start = K.expand_dims(time_start, 2)
            time_end = K.expand_dims(time_end, 2)
            negative_start = K.expand_dims(negative_start, 2)
            negative_end = K.expand_dims(negative_end, 2)

            def masked_bce(y_true, y_pred):
                # Binary cross-entropy averaged over non-padding positions
                # only; padded positions do not contribute to backprop.
                loss = K.binary_crossentropy(y_true, y_pred)
                return K.sum(loss * mask) / K.sum(mask)

            # BUG FIX: the original wrapped the object/subject/loc/time/
            # negative cross-entropies in an extra K.sum(...) BEFORE the
            # mask was applied, collapsing them to scalars — so
            # K.sum(scalar * mask) / K.sum(mask) == scalar, the padding mask
            # had no effect, and those losses were unnormalized sums while
            # the trigger losses were masked means.  Every head now uses
            # the same masked-mean formulation as the trigger losses.
            loss = (
                masked_bce(trigger_start, trigger_start_out) +
                masked_bce(trigger_end, trigger_end_out) +
                masked_bce(object_start, object_start_out) +
                masked_bce(object_end, object_end_out) +
                masked_bce(subject_start, subject_start_out) +
                masked_bce(subject_end, subject_end_out) +
                masked_bce(loc_start, loc_start_out) +
                masked_bce(loc_end, loc_end_out) +
                masked_bce(time_start, time_start_out) +
                masked_bce(time_end, time_end_out) +
                masked_bce(negative_start, negative_start_out) +
                masked_bce(negative_end, negative_end_out)
            )
            train_model.add_loss(loss)
            train_model.compile(
                optimizer=Adam(extract_train_config.learning_rate))
            train_model.summary()
            return trigger_model, subject_model, object_model, time_model, loc_model, negative_model, train_model
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
    """Multi-head attention with optional temporal pooling.

    q_mask: mask of the query sequence; zeroes the padding part of the
        output.
    v_mask: mask of the value sequence; stops attention from reading
        padding information.
    a_mask: mask applied to the attention matrix; different masks implement
        different applications.
    """
    q, k, v = inputs[:3]
    # With only 3 inputs and a truthy a_mask, fall back to a causal
    # ("history only") mask; otherwise the 4th input is the mask tensor.
    if a_mask:
        if len(inputs) == 3:
            a_mask = 'history_only'
        else:
            a_mask = inputs[3]
    # Resolve the mask-producing layers once and cache them on self.
    if q_mask is not None:
        if not hasattr(self, 'q_mask_layer'):
            self.q_mask_layer = search_layer(q, q_mask)
        q_mask = self.q_mask_layer.output_mask
    if v_mask is not None:
        if not hasattr(self, 'v_mask_layer'):
            self.v_mask_layer = search_layer(v, v_mask)
        v_mask = self.v_mask_layer.output_mask
    # Pooling: average-pool q/k/v along time, and subsample the masks to
    # match the pooled length.
    if self.pool_size > 1:
        is_self_attention = (q is k is v)
        q_in_len = K.shape(q)[1]  # remember the pre-pooling length
        q = sequence_masking(q, q_mask, 0)
        q = divisible_temporal_padding(q, self.pool_size)
        q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
        if is_self_attention:
            k = v = q
        else:
            k = sequence_masking(k, v_mask, 0)
            k = divisible_temporal_padding(k, self.pool_size)
            k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
            v = sequence_masking(v, v_mask, 0)
            v = divisible_temporal_padding(v, self.pool_size)
            v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
        if v_mask is not None:
            v_mask = v_mask[:, ::self.pool_size]
        if a_mask is not None and not is_string(a_mask):
            a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention scores
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Clipped relative position encoding
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(pos_ids, -self.max_relative_position,
                         self.max_relative_position)
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued): scale, mask, softmax
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            # 'history_only': build a lower-triangular (causal) mask.
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Weighted sum of values (+ relative-position contribution)
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Restore the original sequence length after pooling
    if self.pool_size > 1:
        o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
    # Zero out padding positions of the output
    o = sequence_masking(o, q_mask, 0)
    return o
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 """ # 处理mask inputs = inputs[:] for i, mask in enumerate([q_mask, v_mask, a_mask]): if not mask: inputs.insert(3 + i, None) q, k, v, q_mask, v_mask = inputs[:5] if len(inputs) == 5: a_mask = 'history_only' elif len(inputs) == 6: a_mask = inputs[-1] else: raise ValueError('wrong inputs for MultiHeadAttention.') # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 相对位置编码 if self.max_relative_position is not None: q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs pos_ids = K.clip(pos_ids, -self.max_relative_position, self.max_relative_position) pos_ids = pos_ids + self.max_relative_position pos_embeddings = K.gather(self.relative_embeddings, pos_ids) a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) # Attention(续) a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: if is_string(a_mask): ones = K.ones_like(a[:1, :1]) a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12 a = a - a_mask else: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if self.max_relative_position is not None: o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.o_dense(o) o = sequence_masking(o, q_mask, 0) return o
def call(self, inputs, mask=None):
    """Softmax over axis 1 with masked positions pushed to ~zero weight."""
    if mask is not None:
        keep = K.expand_dims(K.cast(mask, K.floatx()), 2)
        # Large negative offset drives masked logits to ~0 after softmax.
        inputs = inputs - (1.0 - keep) * 1e12
    return K.softmax(inputs, 1)