def load_from_pretrained(pretrained_path, lr, seq_len, optimizer_type,
                         decay_rate, warmup_steps, decay_steps):
    """Load a pretrained BERT checkpoint and attach a span-prediction head."""
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    model, config = build_model_from_config(
        config_path,
        training=False,
        trainable=True,
        output_layer_num=1,
        seq_len=seq_len,
    )
    load_model_weights_from_checkpoint(model, config, checkpoint_path,
                                       training=False)
    inputs = model.inputs
    outputs = model.outputs
    transformer_output = outputs[0]
    # Project each token representation to two logits: span start and span end.
    logits = keras.layers.Dense(
        units=2,
        trainable=True,
        name='logits',
        kernel_initializer=TruncatedNormal(stddev=0.02))(transformer_output)
    start_logits = Lambda(lambda x: x[:, :, 0], name='start-logits')(logits)
    end_logits = Lambda(lambda x: x[:, :, 1], name='end-logits')(logits)
    model = keras.models.Model(inputs=inputs,
                               outputs=[start_logits, end_logits])
    if optimizer_type == 'decay':
        optimizer = Adam(lr=lr, amsgrad=True, decay=decay_rate)
    else:
        optimizer = AdamWD(lr=lr, amsgrad=True, warmup_steps=warmup_steps,
                           decay_steps=decay_steps)
    model.compile(
        optimizer=optimizer,
        loss=custom_loss,
    )
    model.summary()
    return model
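
# Usage sketch (hypothetical: the checkpoint directory and every hyperparameter
# below are placeholders, not values from the source):
#
# qa_model = load_from_pretrained(
#     pretrained_path='uncased_L-12_H-768_A-12',  # holds bert_config.json / bert_model.ckpt
#     lr=3e-5,
#     seq_len=384,
#     optimizer_type='decay',  # any other value selects AdamWD with warmup
#     decay_rate=0.01,
#     warmup_steps=1000,
#     decay_steps=10000,
# )
# qa_model.fit([token_ids, segment_ids], [start_labels, end_labels], ...)
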
def build_model_from_config(config_file,
                            checkpoint_file,
                            training=False,
                            trainable=False,
                            seq_len=None):
    """Build the model from a config file.

    :param config_file: The path to the JSON configuration file.
    :param checkpoint_file: The path to the pretrained checkpoint.
    :param training: If training, the whole model will be returned.
    :param trainable: Whether the model is trainable.
    :param seq_len: If it is not None and shorter than the value in the
        config file, the weights in position embeddings will be sliced
        to fit the new length.
    :return: train model, subject model and object model
    """
    with open(config_file, 'r') as reader:
        config = json.loads(reader.read())
    if seq_len is not None:
        config['max_position_embeddings'] = min(
            seq_len, config['max_position_embeddings'])
    if trainable is None:
        trainable = training
    model = get_model(
        token_num=config['vocab_size'],
        pos_num=config['max_position_embeddings'],
        seq_len=config['max_position_embeddings'],
        embed_dim=config['hidden_size'],
        transformer_num=config['num_hidden_layers'],
        head_num=config['num_attention_heads'],
        feed_forward_dim=config['intermediate_size'],
        training=False,
        trainable=True,
    )
    # SetLearningRate(model, 0.00001, True)

    # With training=False, get_model returns (inputs, output tensor).
    inputs, outputs = model

    # Label inputs for the training graph: subject tags, the subject span
    # boundaries (k1, k2), and the two object tag sequences.
    t_in = Input(shape=(None,))  # unused
    s_in = Input(shape=(None,))
    k1_in = Input(shape=(1,))
    k2_in = Input(shape=(1,))
    o1_in = Input(shape=(None,))
    o2_in = Input(shape=(None,))
    t, s, k1, k2, o1, o2 = t_in, s_in, k1_in, k2_in, o1_in, o2_in

    # Padding mask derived from the token ids (non-zero = real token).
    mask = Lambda(
        lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(
            inputs[0])

    outputs = Dropout(0.5)(outputs)

    # Self-attentive pooling into a single sentence vector.
    attention = TimeDistributed(Dense(1, activation='tanh'))(outputs)
    attention = MaskFlatten()(attention)
    attention = Activation('softmax')(attention)
    attention = MaskRepeatVector(config['hidden_size'])(attention)
    attention = MaskPermute([2, 1])(attention)
    sent_representation = multiply([outputs, attention])
    attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)

    t_dim = K.int_shape(outputs)[-1]
    # Concatenate every token vector with the pooled sentence vector.
    h = Lambda(seq_and_vec,
               output_shape=(None, t_dim * 2))([outputs, attention])
    conv1 = MaskedConv1D()(h)
    ps = Dense(3, activation='softmax')(conv1)
    # Model that predicts subject spans.
    subject_model = keras.models.Model([inputs[0], inputs[1]], [ps])

    # Predict o1 and o2: gather the subject boundary representations and
    # fuse them with the token features.
    k1 = Lambda(seq_gather, output_shape=(t_dim,))([outputs, k1])
    k2 = Lambda(seq_gather, output_shape=(t_dim,))([outputs, k2])
    k = Concatenate()([k1, k2])
    h = Lambda(seq_and_vec,
               output_shape=(None, t_dim * 2))([outputs, attention])
    h = Lambda(seq_and_vec, output_shape=(None, t_dim * 4))([h, k])
    h = Concatenate(axis=-1)([h, conv1])
    h = MaskedConv1D()(h)
    po1 = Dense(num_classes + 1, activation='softmax')(h)
    po2 = Dense(num_classes + 1, activation='softmax')(h)
    # Given the text and a subject, predict objects and their relations.
    object_model = keras.models.Model([inputs[0], inputs[1], k1_in, k2_in],
                                      [po1, po2])

    train_model = keras.models.Model(
        inputs=[inputs[0], inputs[1], s_in, k1_in, k2_in, o1_in, o2_in],
        outputs=[ps, po1, po2])

    # Masked sparse cross-entropy for each head, attached via add_loss.
    s_loss = K.sparse_categorical_crossentropy(s, ps)
    s_loss = K.sum(s_loss * mask[:, :, 0]) / K.sum(mask)
    o1_loss = K.sparse_categorical_crossentropy(o1, po1)
    o1_loss = K.sum(o1_loss * mask[:, :, 0]) / K.sum(mask)
    o2_loss = K.sparse_categorical_crossentropy(o2, po2)
    o2_loss = K.sum(o2_loss * mask[:, :, 0]) / K.sum(mask)
    train_model.add_loss(s_loss + o1_loss + o2_loss)

    train_model.summary()
    train_model.compile(optimizer=keras.optimizers.Adam(lr=3e-5))
    load_model_weights_from_checkpoint(train_model, config, checkpoint_file,
                                       training)
    return train_model, subject_model, object_model
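
# Training sketch (hypothetical paths and arrays, not from the source). The
# joint loss is attached via add_loss, so fit() receives inputs only:
#
# train_model, subject_model, object_model = build_model_from_config(
#     'bert_config.json', 'bert_model.ckpt', seq_len=160)
# train_model.fit(
#     [token_ids, segment_ids, s_labels, k1, k2, o1_labels, o2_labels],
#     None, batch_size=32, epochs=10)
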
def build_model_from_config(config_file,
                            checkpoint_file,
                            training=False,
                            trainable=False,
                            seq_len=None):
    """Build the model from a config file.

    :param config_file: The path to the JSON configuration file.
    :param checkpoint_file: The path to the pretrained checkpoint.
    :param training: If training, the whole model will be returned.
    :param trainable: Whether the model is trainable.
    :param seq_len: If it is not None and shorter than the value in the
        config file, the weights in position embeddings will be sliced
        to fit the new length.
    :return: train model and entity model
    """
    with open(config_file, 'r') as reader:
        config = json.loads(reader.read())
    if seq_len is not None:
        config['max_position_embeddings'] = min(
            seq_len, config['max_position_embeddings'])
    if trainable is None:
        trainable = training
    model = get_model(
        token_num=config['vocab_size'],
        pos_num=config['max_position_embeddings'],
        seq_len=config['max_position_embeddings'],
        embed_dim=config['hidden_size'],
        transformer_num=config['num_hidden_layers'],
        head_num=config['num_attention_heads'],
        feed_forward_dim=config['intermediate_size'],
        training=False,
        trainable=True,
    )
    # With training=False, get_model returns (inputs, output tensor).
    inputs, outputs = model

    bio_label = Input(shape=(maxlen,))  # gold BIOE tags
    event = Input(shape=(1,))           # event type id

    # Padding mask derived from the token ids (non-zero = real token).
    mask = Lambda(
        lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(
            inputs[0])

    event_embedding = Embedding(len(event2id), hidden_size,
                                mask_zero=True)(event)
    outputs = Dropout(0.15)(outputs)

    # Self-attentive pooling into a single sentence vector.
    attention = TimeDistributed(Dense(1, activation='tanh'))(outputs)
    attention = MaskFlatten()(attention)
    attention = Activation('softmax')(attention)
    attention = MaskRepeatVector(config['hidden_size'])(attention)
    attention = MaskPermute([2, 1])(attention)
    sent_representation = multiply([outputs, attention])
    attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)

    t_dim = K.int_shape(outputs)[-1]
    bert_attention = Lambda(
        seq_and_vec, output_shape=(None, t_dim * 2))([outputs, attention])
    cnn1 = MaskedConv1D(filters=hidden_size,
                        kernel_size=3,
                        activation='relu',
                        padding='same')(bert_attention)
    # Broadcast the event embedding across every time step.
    event_bc = Lambda(lambda x: x[0] * 0 + x[1])([cnn1, event_embedding])
    con_cnn_event = Concatenate(axis=-1)([cnn1, event_bc])
    dens1 = Dense(hidden_size, activation='relu',
                  use_bias=True)(con_cnn_event)
    # BIOE tagging head.
    bio_pred = Dense(4, activation='softmax')(dens1)
    # Model that predicts entity spans.
    entity_model = keras.models.Model([inputs[0], inputs[1], event],
                                      [bio_pred])

    train_model = keras.models.Model([inputs[0], inputs[1], bio_label, event],
                                     [bio_pred])
    # Masked sparse cross-entropy over real tokens only.
    loss = K.sparse_categorical_crossentropy(bio_label, bio_pred)
    loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
    train_model.add_loss(loss)

    train_model.summary()
    train_model.compile(optimizer=keras.optimizers.Adam(lr=3e-5))
    load_model_weights_from_checkpoint(train_model, config, checkpoint_file,
                                       training)
    return train_model, entity_model
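
# Inference sketch (hypothetical arrays). The entity model maps token ids,
# segment ids and an event-type id to per-token BIOE probabilities:
#
# train_model, entity_model = build_model_from_config(
#     'bert_config.json', 'bert_model.ckpt', seq_len=maxlen)
# probs = entity_model.predict([token_ids, segment_ids, event_ids])
# tags = probs.argmax(axis=-1)  # indices into the 4 BIOE classes
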
# Imports assumed by this fragment (keras-bert encoder, keras-contrib CRF):
from keras.models import Model
from keras.layers import Input, LSTM, Bidirectional
from keras_bert import get_model, load_model_weights_from_checkpoint
from keras_contrib.layers import CRF

bert = get_model(
    token_num=config['vocab_size'],
    pos_num=config['max_position_embeddings'],
    seq_len=seq_len,
    embed_dim=config['hidden_size'],
    transformer_num=config['num_hidden_layers'],
    head_num=config['num_attention_heads'],
    feed_forward_dim=config['intermediate_size'],
    feed_forward_activation=config['hidden_act'],
    training=False,
    trainable=True,
    output_layer_num=1,
)
# With training=False, get_model returns (inputs, output tensor); wrap them in
# a Model so the checkpoint can be loaded and the encoder reused as a layer.
inputs, outputs = bert
bert_model = Model(inputs=inputs, outputs=outputs)
load_model_weights_from_checkpoint(bert_model, config,
                                   model_path + "bert_model.ckpt")

x1 = Input(shape=(None,))  # token ids
x2 = Input(shape=(None,))  # segment ids
bert_out = bert_model([x1, x2])
lstm_out = Bidirectional(
    LSTM(64, return_sequences=True, dropout=0.2,
         recurrent_dropout=0.2))(bert_out)
crf_out = CRF(8, sparse_target=True)(lstm_out)
model = Model(inputs=[x1, x2], outputs=crf_out)
model.summary()
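
# Compile sketch (hypothetical, not from the source): keras-contrib's CRF
# carries its own loss and metric, so one would keep a reference to the layer
# instance before compiling, e.g.
#
# crf = CRF(8, sparse_target=True)
# model = Model(inputs=[x1, x2], outputs=crf(lstm_out))
# model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
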
def build_csc_model(max_seq_len):
    # Build the detector: a BiGRU that scores each token with an error
    # probability.
    with open(paths.config, 'r') as reader:
        config = json.load(reader)
    if max_seq_len is not None:
        config['max_position_embeddings'] = min(
            max_seq_len, config['max_position_embeddings'])
    seq_len = config["max_position_embeddings"]
    inputs = get_inputs(seq_len)  # [input_ids, segment_ids, input_mask]
    token_num = len(token_dict)
    embed_dim = config["hidden_size"]
    # config["num_hidden_layers"] = 1
    token_embedding_lookup = TokenEmbedding(
        input_dim=token_num,
        output_dim=embed_dim,
        mask_zero=True,
        trainable=True,
        name='Embedding-Token',
    )
    segment_embedding_lookup = keras.layers.Embedding(
        input_dim=2,
        output_dim=embed_dim,
        trainable=True,
        name='Embedding-Segment',
    )
    position_embed_layer = PositionEmbedding(
        input_dim=seq_len,
        output_dim=embed_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=True,
        name='Embedding-Position',
    )
    token_emb, embed_weights = token_embedding_lookup(inputs[0])
    seg_emb = segment_embedding_lookup(inputs[1])
    add = keras.layers.Add(name='Embedding-Token-Segment')
    embeddings = position_embed_layer(add([token_emb, seg_emb]))
    # embeddings = keras.layers.Embedding(input_dim=token_num, output_dim=embed_dim, mask_zero=True)(inputs[0])
    mask = K.cast(inputs[2], dtype='bool')
    x = keras.layers.Bidirectional(
        keras.layers.GRU(256, return_sequences=True))(embeddings, mask=mask)
    err_prob = keras.layers.Dense(1, activation='sigmoid',
                                  name="error_prob")(x)  # (None, seq_len, 1)
    # detect_model = keras.Model(inputs, err_prob)
    # detect_model.summary()

    # Build the corrector: soft-mask the token embeddings with the detector
    # output and run them through BERT.
    num_classes = char_end_index - char_start_index + 2  # extra id for the OOV original char
    mask_ids = K.constant(mask_id, shape=(1, max_seq_len))
    mask_emb, _ = token_embedding_lookup(mask_ids)
    # Interpolate between the [MASK] embedding and the token embedding by the
    # predicted error probability (broadcasts to (None, seq_len, emb_size)).
    soft_emb = err_prob * mask_emb + (1. - err_prob) * token_emb
    new_embeddings = position_embed_layer(add([soft_emb, seg_emb]))
    bert_output, bert = get_model_from_embedding(
        inputs,
        new_embeddings,
        transformer_num=config['num_hidden_layers'],
        head_num=config['num_attention_heads'],
        feed_forward_dim=config['intermediate_size'],
        feed_forward_activation=config['hidden_act'])
    load_model_weights_from_checkpoint(bert, config, paths.checkpoint)
    # Residual connection from the input embeddings into the output softmax.
    output = keras.layers.Dense(num_classes,
                                activation='softmax',
                                name="correct_prob")(bert_output + embeddings)
    error_prob = err_prob[:, :, 0]  # squeeze the trailing dimension
    correct_model = keras.Model(inputs, [output, error_prob])
    # correct_model.summary()

    mistake_labels = keras.layers.Input(shape=(seq_len,),
                                        dtype='float32',
                                        name="mistake_labels")
    char_labels = keras.layers.Input(shape=(seq_len,),
                                     dtype='int32',
                                     name="char_labels")
    # Training model.
    train_model = keras.Model(inputs=inputs + [mistake_labels, char_labels],
                              outputs=[output, error_prob])
    # Exclude the leading [CLS] and trailing [SEP] from the loss mask.
    mask_sum = K.sum(inputs[2], axis=-1)
    diff = K.one_hot(K.cast(mask_sum, 'int32') - 1,
                     seq_len) + K.one_hot(0, seq_len)
    mask_float = K.cast(inputs[2], K.floatx()) - diff
    args_for_loss = (mask_float, char_labels, mistake_labels, error_prob,
                     output)
    loss = keras.layers.Lambda(custom_loss)(args_for_loss)
    train_model.add_loss(loss)
    train_model.summary()
    train_model.compile(optimizer=keras.optimizers.Adam(learning_rate))
    return train_model, correct_model
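
# Training sketch (hypothetical arrays). The loss is attached via add_loss,
# so fit() takes the five inputs and no targets:
#
# train_model, correct_model = build_csc_model(max_seq_len=128)
# train_model.fit(
#     [input_ids, segment_ids, input_mask, mistake_labels, char_labels],
#     None, batch_size=32, epochs=5)
# corrections, err_probs = correct_model.predict(
#     [input_ids, segment_ids, input_mask])
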
def on_train_begin(self, logs=None):
    load_model_weights_from_checkpoint(self.bert, config,
                                       self.checkpoint_path)
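
# Context sketch (hypothetical): a hook like the one above normally lives in
# a keras.callbacks.Callback subclass; the names below are assumptions, not
# from the source.
#
# class PretrainedWeightLoader(keras.callbacks.Callback):
#     def __init__(self, bert, checkpoint_path):
#         super().__init__()
#         self.bert = bert
#         self.checkpoint_path = checkpoint_path
#
#     def on_train_begin(self, logs=None):  # as above
#         load_model_weights_from_checkpoint(self.bert, config,
#                                            self.checkpoint_path)
#
# model.fit(x, y, callbacks=[PretrainedWeightLoader(bert_model, checkpoint_path)])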