def build_model(self):
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    # "Best-fit with coalescing": a GPU allocator simplified from a version of dlmalloc.
    config.gpu_options.allocator_type = 'BFC'
    if self.memory_fraction:
        config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction
        config.gpu_options.allow_growth = False
    else:
        config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Extra inputs
    subject_labels = Input(shape=(None, 2), name='Subject-Labels')
    subject_ids = Input(shape=(2,), name='Subject-Ids')
    object_labels = Input(shape=(None, self.num_classes, 2), name='Object-Labels')

    # Load the pre-trained model
    bert = build_transformer_model(
        config_path=self.bert_config_path,
        checkpoint_path=self.bert_checkpoint_path,
        return_keras_model=False,
    )

    # Predict subject
    output = Dense(units=2,
                   activation='sigmoid',
                   kernel_initializer=bert.initializer)(bert.model.output)
    subject_preds = Lambda(lambda x: x**2)(output)
    self.subject_model = Model(bert.model.inputs, subject_preds)

    # Feed in the subject and predict the object;
    # Conditional Layer Normalization injects the subject into the object prediction.
    output = bert.model.layers[-2].get_output_at(-1)
    subject = Lambda(self.extrac_subject)([output, subject_ids])
    output = LayerNormalization(conditional=True)([output, subject])
    output = Dense(units=self.num_classes * 2,
                   activation='sigmoid',
                   kernel_initializer=bert.initializer)(output)
    output = Lambda(lambda x: x**4)(output)
    object_preds = Reshape((-1, self.num_classes, 2))(output)
    self.object_model = Model(bert.model.inputs + [subject_ids], object_preds)

    # Training model
    self.model = Model(
        bert.model.inputs + [subject_labels, subject_ids, object_labels],
        [subject_preds, object_preds])

    mask = bert.model.get_layer('Embedding-Token').output_mask
    mask = K.cast(mask, K.floatx())

    subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
    subject_loss = K.mean(subject_loss, 2)
    subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

    object_loss = K.binary_crossentropy(object_labels, object_preds)
    object_loss = K.sum(K.mean(object_loss, 3), 2)
    object_loss = K.sum(object_loss * mask) / K.sum(mask)

    self.model.add_loss(subject_loss + object_loss)

    AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
    self.optimizer = AdamEMA(lr=1e-4)
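# NOTE: `extrac_subject` (used above) is not defined in these snippets. A plausible
# sketch, modeled on the bert4keras relation-extraction example -- an assumption,
# not necessarily the author's original helper. `batch_gather` comes from
# bert4keras.backend.
from bert4keras.backend import K, batch_gather

def extrac_subject(inputs):
    """Gather the subject's start/end vectors from `output` via `subject_ids`
    and concatenate them into one [batch_size, hidden*2] tensor."""
    output, subject_ids = inputs
    start = batch_gather(output, subject_ids[:, :1])  # [batch, 1, hidden]
    end = batch_gather(output, subject_ids[:, 1:])    # [batch, 1, hidden]
    subject = K.concatenate([start, end], 2)          # [batch, 1, hidden*2]
    return subject[:, 0]                              # [batch, hidden*2]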
def build_model():
    bert_model = build_transformer_model(
        config_path=Config.config_path,
        checkpoint_path=Config.checkpoint_path,
        return_keras_model=False)

    # Extra inputs
    subject_labels = Input(shape=(None, 2))
    subject_ids = Input(shape=(2,))
    object_labels = Input(shape=(None, len(predicate2id), 2))

    # Predict subject
    output = Dense(units=2,
                   activation='sigmoid',
                   kernel_initializer=bert_model.initializer)(bert_model.model.output)
    subject_preds = Lambda(lambda x: x**2)(output)
    subject_model = Model(bert_model.model.inputs, subject_preds)

    # Feed in the subject and predict the object
    output = bert_model.model.layers[-2].get_output_at(-1)
    subject = Lambda(extrac_subject)([output, subject_ids])
    output = LayerNormalization(conditional=True)([output, subject])
    output = Dense(units=len(predicate2id) * 2,
                   activation='sigmoid',
                   kernel_initializer=bert_model.initializer)(output)
    output = Lambda(lambda x: x**4)(output)
    object_preds = Reshape((-1, len(predicate2id), 2))(output)
    object_model = Model(bert_model.model.inputs + [subject_ids], object_preds)

    # Training model
    train_model = Model(
        bert_model.model.inputs + [subject_labels, subject_ids, object_labels],
        [subject_preds, object_preds])

    mask = bert_model.model.get_layer('Embedding-Token').output_mask
    mask = K.cast(mask, K.floatx())

    subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
    subject_loss = K.mean(subject_loss, 2)
    subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

    object_loss = K.binary_crossentropy(object_labels, object_preds)
    object_loss = K.sum(K.mean(object_loss, 3), 2)
    object_loss = K.sum(object_loss * mask) / K.sum(mask)

    train_model.add_loss(subject_loss + object_loss)
    optimizer = Adam(Config.learning_rate)
    train_model.compile(optimizer=optimizer)
    return train_model, subject_model, object_model
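# A minimal usage sketch for the build_model() above. `train_generator` (a bert4keras
# DataGenerator) and `Config.epochs` are hypothetical -- neither appears in the snippet.
train_model, subject_model, object_model = build_model()
train_model.fit_generator(
    train_generator.forfit(),              # yields (inputs, None); loss was attached via add_loss
    steps_per_epoch=len(train_generator),
    epochs=Config.epochs,
)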
def build(self, input_shape):
    super(ResidualGatedConv1D, self).build(input_shape)
    self.conv1d = Conv1D(
        filters=self.filters * 2,  # double width: one half for values, one for the gate
        kernel_size=self.kernel_size,
        dilation_rate=self.dilation_rate,
        padding='same',
    )
    self.layernorm = LayerNormalization()
    if self.filters != input_shape[-1]:
        # project the residual branch when the channel counts differ
        self.dense = Dense(self.filters, use_bias=False)
    self.alpha = self.add_weight(
        name='alpha', shape=[1], initializer='zeros'
    )
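# The `build` above only creates the sub-layers; a companion `call` is needed to apply
# them. A sketch of the usual residual gated-convolution forward pass (a gated linear
# unit plus an `alpha`-weighted residual), assuming this layer receives a Keras mask:
def call(self, inputs, mask=None):
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        inputs = inputs * mask[:, :, None]            # zero out padded positions
    outputs = self.conv1d(inputs)                     # [batch, seq_len, filters*2]
    gate = K.sigmoid(outputs[..., self.filters:])     # gating half
    outputs = outputs[..., :self.filters] * gate      # gated linear unit
    outputs = self.layernorm(outputs)
    if hasattr(self, 'dense'):                        # project residual if widths differ
        inputs = self.dense(inputs)
    return inputs + self.alpha * outputs              # learnable residual weight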
def build_model(): """ 搭建模型主体。 :return: 模型对象 """ with SESS.as_default(): with SESS.graph.as_default(): # 构建bert模型主体 bert_model = build_transformer_model( config_path=bert_config.config_path, checkpoint_path=bert_config.checkpoint_path, return_keras_model=False, model=bert_config.model_type ) # l为模型内部的层名,格式为--str for l in bert_model.layers: bert_model.model.get_layer(l).trainable = True # 动词起始,终止下标,keras会自动补充batch这一维度 # [batch_size, 1] trigger_start_index = Input(shape=(1,)) trigger_end_index = Input(shape=(1,)) # 将动词下标对应位置的子向量抽取出来并计算均值 k1v = Lambda(seq_gather)([bert_model.model.output, trigger_start_index]) k2v = Lambda(seq_gather)([bert_model.model.output, trigger_end_index]) kv = Average()([k1v, k2v]) # 融合动词词向量的句子张量 t = LayerNormalization(conditional=True)([bert_model.model.output, kv]) # 取出[CLS]对应的向量用来做分类 t = Lambda(lambda x: x[:, 0])(t) # 预测事件状态 state_out_put = Dense(3, activation='softmax')(t) # 构建状态预测模型 state_model = Model(bert_model.model.inputs + [trigger_start_index, trigger_end_index], state_out_put) # 设置学习率、优化器、损失函数以及每个批次的评估指标 state_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(state_train_config.learning_rate), metrics=['accuracy']) state_model.summary() return state_model
def get_state_model():
    """Build the event-state model, load its weights and return the model object.
    Uses the BERT output fused with the trigger-verb indices to predict the event state.
    :return: state_model
    """
    with state_sess.as_default():
        with state_sess.graph.as_default():
            # Build the BERT backbone
            bert_model = build_transformer_model(
                config_path=bert_config.config_path,
                return_keras_model=False,
                model=bert_config.model_type
            )
            # Trigger-verb index inputs
            trigger_start_index = Input(shape=(1,))
            trigger_end_index = Input(shape=(1,))
            # Gather the verb vectors
            k1v = Lambda(seq_gather)([bert_model.model.output, trigger_start_index])
            k2v = Lambda(seq_gather)([bert_model.model.output, trigger_end_index])
            kv = Average()([k1v, k2v])
            # Fuse the verb vector with the BERT output (the sentence tensor)
            t = LayerNormalization(conditional=True)([bert_model.model.output, kv])
            # Take the [CLS] vector for classification
            t = Lambda(lambda x: x[:, 0])(t)
            # Predict the event state
            state_out_put = Dense(3, activation='softmax')(t)
            # Main model
            state_model = Model(
                bert_model.model.inputs + [trigger_start_index, trigger_end_index],
                state_out_put)
            # Load the weights
            logger.info("Loading event-state model weights...")
            state_model.load_weights(pre_config.event_state_model_path)
            logger.info("Event-state model weights loaded!")
            return state_model
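# A minimal inference sketch for the state model above. `tokenizer` (a bert4keras
# Tokenizer), `text`, and the trigger token indices `t_start`/`t_end` are hypothetical.
import numpy as np

token_ids, segment_ids = tokenizer.encode(text)
probs = state_model.predict([
    np.array([token_ids]),
    np.array([segment_ids]),
    np.array([[t_start]]),
    np.array([[t_end]]),
])
state = probs.argmax(axis=-1)  # index of the predicted event state (0, 1 or 2)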
    return_keras_model=False,
)

# Predict subject
output = Dense(units=2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(bert.model.output)
subject_preds = Lambda(lambda x: x**2)(output)
subject_model = Model(bert.model.inputs, subject_preds)

# Feed in the subject and predict the object;
# Conditional Layer Normalization injects the subject into the object prediction.
output = bert.model.layers[-2].get_output_at(-1)
subject = Lambda(extrac_subject)([output, subject_ids])
output = LayerNormalization(conditional=True)([output, subject])
output = Dense(units=len(predicate2id) * 2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(output)
output = Lambda(lambda x: x**4)(output)
object_preds = Reshape((-1, len(predicate2id), 2))(output)
object_model = Model(bert.model.inputs + [subject_ids], object_preds)

# Training model
train_model = Model(
    bert.model.inputs + [subject_labels, subject_ids, object_labels],
    [subject_preds, object_preds])
# train_model.summary()
def build_model(): """ 调用模型参数,搭建事件抽取模型主体,先搭建触发词模型,然后围绕触发词下标搭建其他论元模型。 :return: 各个论元模型对象 """ with SESS.as_default(): with SESS.graph.as_default(): # 构建bert模型主体 bert_model = build_transformer_model( config_path=bert_config.config_path, checkpoint_path=bert_config.checkpoint_path, return_keras_model=False, model=bert_config.model_type) # l为模型内部的层名,格式为--str for l in bert_model.layers: bert_model.model.get_layer(l).trainable = True # 搭建模型 # keras会自动对所有的占位张量添加batch_size维度 # 动词输入 (batch_size, seq_len) trigger_start_in = Input(shape=(None, )) trigger_end_in = Input(shape=(None, )) # 动词下标输入 (batch_size, seq_len) trigger_index_start_in = Input(shape=(1, )) trigger_index_end_in = Input(shape=(1, )) # 宾语输入 (batch_size, seq_len) object_start_in = Input(shape=(None, )) object_end_in = Input(shape=(None, )) # 主语输入 (batch_size, seq_len) subject_start_in = Input(shape=(None, )) subject_end_in = Input(shape=(None, )) # 地点输入 (batch_size, seq_len) loc_start_in = Input(shape=(None, )) loc_end_in = Input(shape=(None, )) # 时间输入 (batch_size, seq_len) time_start_in = Input(shape=(None, )) time_end_in = Input(shape=(None, )) # 否定词输入 (batch_size, seq_len) negative_start_in = Input(shape=(None, )) negative_end_in = Input(shape=(None, )) # 将输入的占位符赋值给相应的变量(此处只是在使用时方便,没有其他的模型结构意义) # 动词输入 trigger_start, trigger_end = trigger_start_in, trigger_end_in # 动词下标 trigger_index_start, trigger_index_end = trigger_index_start_in, trigger_index_end_in # 宾语输入 object_start, object_end = object_start_in, object_end_in # 主语输入 subject_start, subject_end = subject_start_in, subject_end_in # 地点输入 loc_start, loc_end = loc_start_in, loc_end_in # 时间输入 time_start, time_end = time_start_in, time_end_in # 否定词输入 negative_start, negative_end = negative_start_in, negative_end_in # bert_model.model.inputs为列表格式,含有两个张量,[token_ids(batch, seq_len), segment_ids(batch, seq_len)] # mask操作,将bert模型的输入的token_ids序列,进行维度扩充[batch_size, seq_len, 1], # 然后将初始填充为0的地方全部都用0代替,非0的地方都用1占位, # 这是为后边计算损失做准备,防止计算损失时,前期填充为0的位置也进行反向传播 mask = Lambda(lambda x: K.cast( K.greater(K.expand_dims(x[0], 2), 0), 'float32'))( bert_model.model.inputs) # 计算动词输出的起始终止标签,bert_model.model.output [batch_size, seq_len, 768] trigger_start_out = Dense(1, activation='sigmoid')( bert_model.model.output) trigger_end_out = Dense(1, activation='sigmoid')( bert_model.model.output) # 预测trigger动词的模型 trigger_model = Model(bert_model.model.inputs, [trigger_start_out, trigger_end_out]) # 将动词下标对应位置的子向量抽取出来并计算均值 k1v = Lambda(seq_gather)( [bert_model.model.output, trigger_index_start]) k2v = Lambda(seq_gather)( [bert_model.model.output, trigger_index_end]) kv = Average()([k1v, k2v]) # 融合动词词向量的句子张量,用来作为预测其它论元部分的向量 t = LayerNormalization(conditional=True)( [bert_model.model.output, kv]) # 宾语模型输出 object_start_out = Dense(1, activation='sigmoid')(t) object_end_out = Dense(1, activation='sigmoid')(t) # 主语模型输出 subject_start_out = Dense(1, activation='sigmoid')(t) subject_end_out = Dense(1, activation='sigmoid')(t) # 地点模型输出 loc_start_out = Dense(1, activation='sigmoid')(t) loc_end_out = Dense(1, activation='sigmoid')(t) # 时间模型输出 time_start_out = Dense(1, activation='sigmoid')(t) time_end_out = Dense(1, activation='sigmoid')(t) # 否定词模型输出 negative_start_out = Dense(1, activation='sigmoid')(t) negative_end_out = Dense(1, activation='sigmoid')(t) # 输入text和trigger,预测object object_model = Model( bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in], [object_start_out, object_end_out]) # 输入text和trigger,预测subject subject_model = Model( bert_model.model.inputs + [trigger_index_start_in, 
trigger_index_end_in], [subject_start_out, subject_end_out]) # 输入text和trigger,预测loc loc_model = Model( bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in], [loc_start_out, loc_end_out]) # 输入text和trigger,预测time time_model = Model( bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in], [time_start_out, time_end_out]) # 否定词模型 negative_model = Model( bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in], [negative_start_out, negative_end_out]) # 主模型 train_model = Model( bert_model.model.inputs + [ trigger_start_in, trigger_end_in, trigger_index_start_in, trigger_index_end_in, object_start_in, object_end_in, subject_start_in, subject_end_in, loc_start_in, loc_end_in, time_start_in, time_end_in, negative_start_in, negative_end_in ], [ trigger_start_out, trigger_end_out, object_start_out, object_end_out, subject_start_out, subject_end_out, loc_start_out, loc_end_out, time_start_out, time_end_out, negative_start_out, negative_end_out ]) # 扩充维度, 构造成与mask矩阵相同的结构,方便后续计算模型各部分损失 trigger_start = K.expand_dims(trigger_start, 2) trigger_end = K.expand_dims(trigger_end, 2) object_start = K.expand_dims(object_start, 2) object_end = K.expand_dims(object_end, 2) subject_start = K.expand_dims(subject_start, 2) subject_end = K.expand_dims(subject_end, 2) loc_start = K.expand_dims(loc_start, 2) loc_end = K.expand_dims(loc_end, 2) time_start = K.expand_dims(time_start, 2) time_end = K.expand_dims(time_end, 2) negative_start = K.expand_dims(negative_start, 2) negative_end = K.expand_dims(negative_end, 2) # 构造模型损失函数,使用mask矩阵将前期填充为0的位置全部掩掉,不进行反向传播。 # 动词损失 trigger_start_loss = K.binary_crossentropy(trigger_start, trigger_start_out) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 trigger_start_loss = K.sum(trigger_start_loss * mask) / K.sum(mask) trigger_end_loss = K.binary_crossentropy(trigger_end, trigger_end_out) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 trigger_end_loss = K.sum(trigger_end_loss * mask) / K.sum(mask) # 宾语损失 object_start_loss = K.sum( K.binary_crossentropy(object_start, object_start_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 object_start_loss = K.sum(object_start_loss * mask) / K.sum(mask) object_end_loss = K.sum( K.binary_crossentropy(object_end, object_end_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 object_end_loss = K.sum(object_end_loss * mask) / K.sum(mask) # 主语损失 subject_start_loss = K.sum( K.binary_crossentropy(subject_start, subject_start_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 subject_start_loss = K.sum(subject_start_loss * mask) / K.sum(mask) subject_end_loss = K.sum( K.binary_crossentropy(subject_end, subject_end_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 subject_end_loss = K.sum(subject_end_loss * mask) / K.sum(mask) # 地点损失 loc_start_loss = K.sum( K.binary_crossentropy(loc_start, loc_start_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 loc_start_loss = K.sum(loc_start_loss * mask) / K.sum(mask) loc_end_loss = K.sum(K.binary_crossentropy(loc_end, loc_end_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 loc_end_loss = K.sum(loc_end_loss * mask) / K.sum(mask) # 时间损失 time_start_loss = K.sum( K.binary_crossentropy(time_start, time_start_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 time_start_loss = K.sum(time_start_loss * mask) / K.sum(mask) time_end_loss = K.sum(K.binary_crossentropy( time_end, time_end_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 time_end_loss = K.sum(time_end_loss * mask) / K.sum(mask) # 否定词损失 negative_start_loss = K.sum( K.binary_crossentropy(negative_start, negative_start_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 negative_start_loss = K.sum( negative_start_loss * mask) / K.sum(mask) 
negative_end_loss = K.sum( K.binary_crossentropy(negative_end, negative_end_out)) # 使用mask矩阵,将前期填充为0的位置掩掉,不进行反向传播 negative_end_loss = K.sum(negative_end_loss * mask) / K.sum(mask) # 合并损失 loss = (trigger_start_loss + trigger_end_loss) + ( object_start_loss + object_end_loss) + (subject_start_loss + subject_end_loss) + ( loc_start_loss + loc_end_loss) + (time_start_loss + time_end_loss) + ( negative_start_loss + negative_end_loss) train_model.add_loss(loss) train_model.compile( optimizer=Adam(extract_train_config.learning_rate)) train_model.summary() return trigger_model, subject_model, object_model, time_model, loc_model, negative_model, train_model
# Predict subject
output = Dense(units=2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(
                   bert.model.output)  # [?, ?, 768] -> [?, ?, 2]
subject_preds = Lambda(lambda x: x**2)(output)
subject_model = Model(bert.model.inputs, subject_preds)  # text -> subject

# Feed in the subject and predict the object;
# Conditional Layer Normalization injects the subject into the object prediction.
output = bert.model.layers[-2].get_output_at(-1)
subject = Lambda(extrac_subject)(
    [output, subject_ids])  # output [?, ?, 768], subject_ids [?, 2] -> [?, 768*2]
output = LayerNormalization(conditional=True)(
    [output, subject])  # [?, ?, 768]; mean/std conditioned on the subject hidden state
output = Dense(units=len(predicate2id) * 2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(output)
output = Reshape(
    (-1, len(predicate2id), 2))(output)  # [?, ?, 49*2] -> [?, ?, 49, 2]
object_preds = Lambda(lambda x: x**4)(output)
object_model = Model(bert.model.inputs + [subject_ids],
                     object_preds)  # (subject, text) -> (object, predicate)

# Training model
train_model = Model(
    bert.model.inputs + [subject_labels, subject_ids, object_labels],
    [subject_preds, object_preds])
def get_extract_model():
    """Build the event-extraction model structure, load the weights and return the models.
    1. Use the BERT output to predict the trigger-verb indices.
    2. Fuse the BERT output with the trigger indices to predict the event's time,
       location, subject, object and negation word.
    :return: the individual model objects
    """
    with extract_sess.as_default():
        with extract_sess.graph.as_default():
            # Build the BERT backbone
            bert_model = build_transformer_model(
                config_path=bert_config.config_path,
                return_keras_model=False,
                model=bert_config.model_type
            )
            # Assemble the model
            # Trigger inputs
            trigger_start_in = Input(shape=(None,))
            trigger_end_in = Input(shape=(None,))
            # Trigger index inputs
            trigger_index_start_in = Input(shape=(1,))
            trigger_index_end_in = Input(shape=(1,))
            # Object inputs
            object_start_in = Input(shape=(None,))
            object_end_in = Input(shape=(None,))
            # Subject inputs
            subject_start_in = Input(shape=(None,))
            subject_end_in = Input(shape=(None,))
            # Location inputs
            loc_start_in = Input(shape=(None,))
            loc_end_in = Input(shape=(None,))
            # Time inputs
            time_start_in = Input(shape=(None,))
            time_end_in = Input(shape=(None,))
            # Negation-word inputs
            negative_start_in = Input(shape=(None,))
            negative_end_in = Input(shape=(None,))

            # Assign the externally supplied indices to internal variables (only to
            # separate in-model use from the Model(...) input definitions)
            trigger_index_start, trigger_index_end = trigger_index_start_in, trigger_index_end_in

            trigger_start_out = Dense(1, activation='sigmoid')(bert_model.model.output)
            trigger_end_out = Dense(1, activation='sigmoid')(bert_model.model.output)
            # Trigger-prediction model
            trigger_model = Model(bert_model.model.inputs,
                                  [trigger_start_out, trigger_end_out])

            # Gather the token vectors at the trigger indices
            k1v = Lambda(seq_gather)([bert_model.model.output, trigger_index_start])
            k2v = Lambda(seq_gather)([bert_model.model.output, trigger_index_end])
            kv = Average()([k1v, k2v])
            # Fuse the trigger vector with the sentence tensor via conditional normalization
            t = LayerNormalization(conditional=True)([bert_model.model.output, kv])

            # Object outputs
            object_start_out = Dense(1, activation='sigmoid')(t)
            object_end_out = Dense(1, activation='sigmoid')(t)
            # Subject outputs
            subject_start_out = Dense(1, activation='sigmoid')(t)
            subject_end_out = Dense(1, activation='sigmoid')(t)
            # Location outputs
            loc_start_out = Dense(1, activation='sigmoid')(t)
            loc_end_out = Dense(1, activation='sigmoid')(t)
            # Time outputs
            time_start_out = Dense(1, activation='sigmoid')(t)
            time_end_out = Dense(1, activation='sigmoid')(t)
            # Negation-word outputs
            negative_start_out = Dense(1, activation='sigmoid')(t)
            negative_end_out = Dense(1, activation='sigmoid')(t)

            # Given text and trigger, predict object
            object_model = Model(
                bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in],
                [object_start_out, object_end_out])
            # Given text and trigger, predict subject
            subject_model = Model(
                bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in],
                [subject_start_out, subject_end_out])
            # Given text and trigger, predict loc
            loc_model = Model(
                bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in],
                [loc_start_out, loc_end_out])
            # Given text and trigger, predict time
            time_model = Model(
                bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in],
                [time_start_out, time_end_out])
            # Given text and trigger, predict the negation word
            negative_model = Model(
                bert_model.model.inputs + [trigger_index_start_in, trigger_index_end_in],
                [negative_start_out, negative_end_out])

            # Main model
            train_model = Model(
                bert_model.model.inputs +
                [trigger_start_in, trigger_end_in, trigger_index_start_in,
                 trigger_index_end_in, object_start_in, object_end_in,
                 subject_start_in, subject_end_in, loc_start_in, loc_end_in,
                 time_start_in, time_end_in, negative_start_in, negative_end_in],
                [trigger_start_out, trigger_end_out, object_start_out, object_end_out,
                 subject_start_out, subject_end_out, loc_start_out, loc_end_out,
                 time_start_out, time_end_out, negative_start_out, negative_end_out])

            # Load the event-extraction weights
            logger.info("Loading event-extraction model weights...")
            train_model.load_weights(pre_config.event_extract_model_path)
            logger.info("Event-extraction model weights loaded!")
            return trigger_model, object_model, subject_model, loc_model, time_model, negative_model
## Embed the predicate
predicate_n = 49
emb_l = Embedding(predicate_n, 32, name='p_emb')  # keras.layers.Embedding, not tf.keras.layers
predicate_emb = emb_l(predicate_id)

output = bert.model.layers[-2].get_output_at(-1)
subject = Lambda(extrac_subject)(
    [output, subject_ids])  # output [?, ?, 768], subject_ids [?, 2] -> subject [?, 768*2]
# Alternatives that were tried:
# subject_predicate = K.concatenate([subject, predicate_emb[:, 0, :]], axis=-1)
# subject_predicate = Concatenate(-1)([subject, predicate_emb[:, 0, :]])
subject_predicate = Lambda(concat)([subject, predicate_emb])

output = LayerNormalization(conditional=True, name='specialNorm')(
    [output, subject_predicate])  # [?, ?, 768]; mean/std conditioned on the subject hidden state
output = Dense(
    units=2,  # units=len(predicate2id) * 2,
    activation='sigmoid',
    kernel_initializer=bert.initializer)(output)
# output = Reshape((-1, 2))(output)  # [?, ?, 2]
object_preds = Lambda(lambda x: x**4)(output)
# object_model = Model(bert.model.inputs + [subject_ids], object_preds)  # (subject, text) -> (object, predicate)

# Training model
# train_model = Model(bert.model.inputs + [subject_labels, subject_ids, object_labels],
#                     [subject_preds, object_preds])
train_model = Model(
    bert.model.inputs + [subject_ids, predicate_id, object_labels],
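# The `concat` wrapped in the Lambda above is not shown. Judging from the
# commented-out alternatives, it likely concatenates the subject vector with the
# squeezed predicate embedding -- a sketch, not the original definition:
def concat(x):
    subject, predicate_emb = x                        # [batch, 1536], [batch, 1, 32]
    return K.concatenate([subject, predicate_emb[:, 0]], axis=-1)  # [batch, 1568]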
subject_preds = Lambda(lambda x: x**2)(output)  # squaring the probabilities eases the class imbalance
# The subject_model.
# bert.model.inputs holds Input-Token and Input-Segment:
#   0 = Tensor("Input-Token:0", shape=(?, ?), dtype=float32)
#   1 = Tensor("Input-Segment:0", shape=(?, ?), dtype=float32)
subject_model = Model(bert.model.inputs, subject_preds)

# 3.2 Feed in subject_ids and use the pointers to influence the object prediction.
# Conditional Layer Normalization injects the subject into the object prediction.
output = bert.model.layers[-2].get_output_at(-1)
# (?, ?, 768): skip BERT's final LN and apply our own CLN instead;
# get_output_at is the Keras call for fetching a shared layer's output.
subject = Lambda(extrac_subject)(
    [output, subject_ids])  # (?, 1536): the subject's start/end vectors gathered from output by subject_ids
output = LayerNormalization(conditional=True)([output, subject])
# Condition on the subject: the (?, 1536) subject goes through a Dense to (?, 1, 768)
# and shifts the normalization's beta and gamma, i.e. the scale and offset.
# Given subject_ids plus the corresponding predicate, predict the object;
# the prediction here does not depend on the predicate's actual value.
output = Dense(
    units=len(predicate2id) * 2,  # the dense layer outputs probabilities
    activation='sigmoid',
    kernel_initializer=bert.initializer)(output)
output = Lambda(lambda x: x**4)(output)
object_preds = Reshape(
    (-1, len(predicate2id), 2))(output)
# (?, ?, 49, 2): the -1 lets Keras infer that dimension from the others.
# The object_model.
object_model = Model(bert.model.inputs + [subject_ids], object_preds)

# Define the loss: simply add the subject and object prediction losses.
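# A sketch of that combined loss, mirroring the other variants in this section
# (masked binary cross-entropy over non-padding tokens). It assumes the same
# subject_labels/object_labels inputs and a train_model as in the snippets above.
mask = bert.model.get_layer('Embedding-Token').output_mask
mask = K.cast(mask, K.floatx())

subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
subject_loss = K.mean(subject_loss, 2)
subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

object_loss = K.binary_crossentropy(object_labels, object_preds)
object_loss = K.sum(K.mean(object_loss, 3), 2)
object_loss = K.sum(object_loss * mask) / K.sum(mask)

train_model.add_loss(subject_loss + object_loss)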