def build_xlnet(args):
    # Load pretrained model
    model = load_trained_model_from_checkpoint(
        config_path=args.config_path,
        checkpoint_path=args.model_path,
        batch_size=args.batch_size,
        memory_len=0,
        target_len=args.maxlen,
        in_train_phase=False,
        attention_type=ATTENTION_TYPE_BI,
    )

    # Build classification model
    last = model.output
    extract = Extract(index=-1, name='Extract')(last)
    output = keras.layers.Dense(units=args.nclass, activation='softmax', name='Softmax')(extract)
    model = keras.models.Model(inputs=model.inputs, outputs=output)

    # Compile model
    model.compile(
        optimizer=RAdam(args.lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()
    return model
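# For context, a minimal sketch of how build_xlnet might be called;
# SimpleNamespace stands in for a parsed argparse result, and the paths and
# hyperparameters below are hypothetical.
from types import SimpleNamespace

args = SimpleNamespace(
    config_path='xlnet_config.json',  # hypothetical checkpoint files
    model_path='xlnet_model.ckpt',
    batch_size=16,
    maxlen=128,
    nclass=2,
    lr=3e-5,
)
model = build_xlnet(args)  # expects one-hot labels, per the categorical loss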
def build_model():
    # Load pretrained weights
    model = load_trained_model_from_checkpoint(
        config_path=paths.config,
        checkpoint_path=paths.model,
        batch_size=BATCH_SIZE,
        memory_len=MEMORY_LEN,
        target_len=TEXT_LEN,
        in_train_phase=False,
        attention_type=ATTENTION_TYPE_BI,
    )

    # Build classification model
    last = model.output
    extract = Extract(index=-1, name='Extract')(last)
    dense = keras.layers.Dense(units=768, name='Dense')(extract)
    norm = keras.layers.BatchNormalization(name='Normal')(dense)
    output = keras.layers.Dense(units=2, activation='softmax', name='Softmax')(norm)
    model = keras.models.Model(inputs=model.inputs, outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return model
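# Since the head is compiled with sparse_categorical_crossentropy, the fit
# targets are integer class ids rather than one-hot rows. A toy sketch: the
# array shapes are assumptions, and whether the loaded model also takes a
# per-sample memory-length input depends on the keras-xlnet version.
import numpy as np

tokens = np.zeros((8, TEXT_LEN), dtype='int32')
segments = np.zeros((8, TEXT_LEN), dtype='int32')
memories = np.zeros((8, 1), dtype='int32')  # assumed memory-length input
labels = np.array([0, 1, 0, 1, 0, 1, 0, 1])  # class ids, not one-hot

model = build_model()
model.fit([tokens, segments, memories], labels, batch_size=BATCH_SIZE, epochs=1)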
def test_sample(self):
    input_layer = keras.layers.Input(
        shape=(3, 4),
        name='Input',
    )
    extract_layer = Extract(
        index=1,
        name='Extract',
    )(input_layer)
    model = keras.models.Model(
        inputs=input_layer,
        outputs=extract_layer,
    )
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
    inputs = np.asarray([[
        [0.1, 0.2, 0.3, 0.4],
        [-0.1, 0.2, -0.3, 0.4],
        [0.1, -0.2, 0.3, -0.4],
    ]])
    predict = model.predict(inputs)
    # index=1 selects the second timestep of each sample.
    expected = np.asarray([[-0.1, 0.2, -0.3, 0.4]])
    self.assertTrue(np.allclose(expected, predict), predict)
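# The behavior under test is a constant slice along the time axis. A NumPy
# sketch of what Extract(index=1) computes, assuming the layer's forward pass
# is x[:, index] (consistent with index=0 pulling the [CLS] position elsewhere
# in this section).
import numpy as np

def extract(x, index):
    # NumPy equivalent of the Extract layer: keep one timestep per sample.
    return x[:, index]

batch = np.asarray([[
    [0.1, 0.2, 0.3, 0.4],
    [-0.1, 0.2, -0.3, 0.4],
    [0.1, -0.2, 0.3, -0.4],
]])
print(extract(batch, 1))  # [[-0.1  0.2 -0.3  0.4]]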
def build_bert(model, poolings=None, output_layer_num=1):
    """Build a BERT feature-extraction graph.

    :param model: Path to the checkpoint or a built model without MLM and NSP.
    :param poolings: Pooling methods. Word embeddings are used when it is None,
        otherwise the pooled embeddings are concatenated.
    :param output_layer_num: The number of layers whose outputs will be
        concatenated as a single output. Only available when `model` is a path
        to a checkpoint.
    :return: The model inputs and the summed output tensor.
    """
    # NOTE: the `model` argument is immediately overridden with the downloaded
    # multi-cased base checkpoint, so the caller's value is effectively ignored.
    model = get_pretrained(PretrainedList.multi_cased_base)
    if isinstance(model, (str, type(u''))):
        paths = get_checkpoint_paths(model)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=output_layer_num,
        )
    if poolings is None:
        # Fall back to the raw word embeddings when no pooling is requested.
        outputs = model.outputs[0]
    else:
        if isinstance(poolings, (str, type(u''))):
            poolings = [poolings]
        outputs = []
        for pooling in poolings:
            if pooling == POOL_NSP:
                outputs.append(Extract(index=0, name='Pool-NSP')(model.outputs[0]))
            elif pooling == POOL_MAX:
                outputs.append(MaskedGlobalMaxPool1D(name='Pool-Max')(model.outputs[0]))
            elif pooling == POOL_AVE:
                outputs.append(keras.layers.GlobalAvgPool1D(name='Pool-Ave')(model.outputs[0]))
            else:
                raise ValueError('Unknown pooling method: {}'.format(pooling))
        if len(outputs) == 1:
            outputs = outputs[0]
        else:
            outputs = keras.layers.Concatenate(name='Concatenate')(outputs)
    outputs = Lambda(bert_output_sum)(outputs)
    return model.inputs, outputs
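# A hedged usage sketch of the returned graph; the pooling constants come from
# this snippet's own namespace, and wrapping the tensors in a Model is an
# assumption about how the caller consumes them.
inputs, outputs = build_bert(None, poolings=[POOL_NSP, POOL_MAX], output_layer_num=4)
feature_model = keras.models.Model(inputs=inputs, outputs=outputs)
# feature_model.predict(...) then yields one pooled vector per input text.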
def get_finetune_model():
    word2id_dict, id2word_dict = utils.get_word_id_map(word_id_map_file_path)
    input_layer, transformed = keras_bert.my_get_model(
        token_num=len(word2id_dict),
        head_num=hp.head_num,
        transformer_num=hp.transformer_num,
        embed_dim=hp.embed_dim,
        feed_forward_dim=hp.feed_forward_dim,
        dropout_rate=hp.dropout_rate,
        seq_len=hp.seq_len,
        pos_num=hp.seq_len,
        attention_activation='gelu',
        training=False,  # NOTE: must be False so the feature extractor is returned
        trainable=True)
    # Take the [CLS] feature and regress the 2-D coordinates from it
    extract_layer = Extract(index=0, name='Extract')(transformed)
    output_layer = keras.layers.Dense(units=2, activation='relu', name='coor_output')(extract_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    return model
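# Because training=False makes my_get_model return only the backbone graph,
# pretrained weights still have to be loaded into the assembled model
# afterwards. A minimal sketch, assuming by_name=True matching and the
# RMSprop/MSE setup used by the training routine later in this section
# (the checkpoint path is hypothetical).
model = get_finetune_model()
# by_name=True leaves the new regression head randomly initialized while the
# shared BERT layers pick up their pretrained weights.
model.load_weights('pretrained_bert_weights.h5', by_name=True)
model.compile(optimizer=keras.optimizers.RMSprop(1e-4), loss='mse', metrics=['mae'])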
train_seq = generate_sequence(train_path)
dev_seq = generate_sequence(dev_path)

# Load pretrained model
model = load_trained_model_from_checkpoint(
    config_path=paths.config,
    checkpoint_path=paths.model,
    batch_size=BATCH_SIZE,
    memory_len=0,
    target_len=SEQ_LEN,
    in_train_phase=False,
    attention_type=ATTENTION_TYPE_BI,
)

# Build classification model
last = Extract(index=-1, name='Extract')(model.output)
dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last)
dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense)
output = keras.layers.Dense(units=2, activation='softmax', name='Softmax')(dropout)
model = keras.models.Model(inputs=model.inputs, outputs=output)
model.summary()

# Fit model
if os.path.exists(MODEL_NAME):
    model.load_weights(MODEL_NAME)
model.compile(
    optimizer=keras.optimizers.Adam(lr=3e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)
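# The train_seq and dev_seq sequences built at the top are never consumed in
# this excerpt; a hedged sketch of the training call they appear intended for
# (epoch count and checkpointing policy are assumptions).
model.fit_generator(
    generator=train_seq,
    validation_data=dev_seq,
    epochs=5,  # hypothetical
    callbacks=[keras.callbacks.ModelCheckpoint(MODEL_NAME, save_weights_only=True)],
)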
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.

    See: https://arxiv.org/pdf/1909.11942.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and SOP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer
                     will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len,), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len,), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')([embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build the shared transformer (ALBERT shares parameters across all layers)
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(
        units=feed_forward_dim,
        activation=feed_forward_activation,
        name='Feed-Forward',
    )
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)
        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [transformed_layers[index] for index in output_layers]
            output = keras.layers.Concatenate(name='Output')(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
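# A small instantiation to make the sharing concrete: because one
# MultiHeadAttention/FeedForward pair is applied at every depth, the encoder's
# parameter count stays flat as transformer_num grows. Toy sizes below, not
# the paper's configuration.
inputs, output = build_albert(
    token_num=30000,
    embed_dim=64,
    hidden_dim=256,
    transformer_num=12,  # 12 passes through the *same* shared block
    head_num=4,
    feed_forward_dim=1024,
    training=False,
    trainable=False,
)
model = keras.models.Model(inputs=inputs, outputs=output)
model.summary()  # encoder parameters are counted once despite 12 layers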
def get_model(token_num,
              pos_num=512,
              seq_len=512,
              embed_dim=768,
              transformer_num=12,
              head_num=12,
              feed_forward_dim=3072,
              dropout_rate=0.1,
              attention_activation=None,
              feed_forward_activation='gelu',
              training=True,
              trainable=None,
              output_layer_num=1,
              use_task_embed=False,
              task_num=10,
              use_adapter=False,
              adapter_units=None):
    """Get BERT model.

    See: https://arxiv.org/pdf/1810.04805.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer
                     will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated
                             as a single output. Only available when `training` is `False`.
    :param use_task_embed: Whether to add task embeddings to existing embeddings.
    :param task_num: The number of tasks.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in the feed-forward adapter.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training
    if adapter_units is None:
        adapter_units = max(1, embed_dim // 100)

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    x, s, m = inputs
    x = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Token-Reshape')(x)
    s = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Segment-Reshape')(s)
    m = keras.layers.Lambda(lambda x: keras.backend.reshape(x, [-1, pos_num]),
                            name='Input-Mention-Reshape')(m)
    embed_layer, embed_weights = get_embedding(
        [x, s, m],
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if use_task_embed:
        task_input = keras.layers.Input(
            shape=(1,),
            name='Input-Task',
        )
        embed_layer = TaskEmbedding(
            input_dim=task_num,
            output_dim=embed_dim,
            mask_zero=False,
            name='Embedding-Task',
        )([embed_layer, task_input])
        inputs = inputs[:2] + [task_input, inputs[-1]]
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=gelu,
    )
    if training:
        mlm_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = EmbeddingSimilarity(name='MLM-Sim')(
            [mlm_norm_layer, embed_weights])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=embed_dim,
            activation='tanh',
            name='NSP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    else:
        model = keras.models.Model(inputs=inputs, outputs=transformed)
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        if isinstance(output_layer_num, int):
            output_layer_num = min(output_layer_num, transformer_num)
            output_layer_num = [-i for i in range(1, output_layer_num + 1)]
        outputs = []
        for layer_index in output_layer_num:
            if layer_index < 0:
                layer_index = transformer_num + layer_index
            layer_index += 1
            layer = model.get_layer(name='Encoder-{}-FeedForward-Norm'.format(layer_index))
            outputs.append(layer.output)
        if len(outputs) > 1:
            transformed = keras.layers.Concatenate(name='Encoder-Output')(list(reversed(outputs)))
        else:
            transformed = outputs[0]
        return inputs, transformed
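# With training=False this function returns (inputs, transformed) rather than
# a Model, and an integer output_layer_num=k concatenates the outputs of the
# last k 'Encoder-*-FeedForward-Norm' blocks along the feature axis. A hedged
# sketch, assuming the default embed_dim=768 and this variant's three inputs
# (token, segment, mention ids) from get_inputs.
inputs, transformed = get_model(
    token_num=30000,     # hypothetical vocabulary size
    training=False,
    trainable=False,
    output_layer_num=4,  # concatenate the last four encoder outputs
)
model = keras.models.Model(inputs=inputs, outputs=transformed)
# Each position now carries 4 * 768 = 3072 features.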
def bert_indoorlocation_train_with_label():
    config = tf.ConfigProto(allow_soft_placement=True)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    config.gpu_options.allow_growth = True

    # Prepare training and validation data
    word2id_dict, id2word_dict = utils.get_word_id_map(word_id_map_file_path)
    x_train, y_train, reference_tags_train = utils.gen_fine_tune_bert_data(
        train_datafile_path, seq_len)
    x_valid, y_valid, reference_tags_valid = utils.gen_fine_tune_bert_data(
        valid_datafile_path, seq_len)

    # Initialize the model and its parameters
    input_layer, transformed = keras_bert.my_get_model(
        token_num=len(word2id_dict),
        head_num=hp.head_num,
        transformer_num=hp.transformer_num,
        embed_dim=hp.embed_dim,
        feed_forward_dim=hp.feed_forward_dim,
        dropout_rate=hp.dropout_rate,
        seq_len=hp.seq_len,
        pos_num=hp.seq_len,
        attention_activation='gelu',
        training=False,  # NOTE: must be False so the feature extractor is returned
        trainable=True)
    extract_layer = Extract(index=0, name='Extract')(transformed)
    output_layer = keras.layers.Dense(units=2, activation='relu', name='coor_output')(extract_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)

    if flag_retrain or only_evaluate_history_model_flag:
        model.load_weights(trained_model_path)
    else:
        model.load_weights(pretrained_model_path, by_name=True)
    model.summary()

    if not only_evaluate_history_model_flag:
        optimizer = keras.optimizers.RMSprop(LR)
        model.compile(
            optimizer=optimizer,
            loss='mse',
            metrics=['mae', 'mse'],
        )
        early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=5)
        model.fit(x_train,
                  y_train,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  callbacks=[early_stopping])
        model.save(trained_model_path)
        utils.evaluate_fine_tune_model(model, test_datafile_path)
    else:
        utils.evaluate_fine_tune_model(model, test_datafile_path)
    config_path=paths.config,
    checkpoint_path=paths.model,
    batch_size=BATCH_SIZE,
    memory_len=0,
    target_len=SEQ_LEN,
    in_train_phase=False,
    attention_type=ATTENTION_TYPE_BI,
)  # Load pretrained weights

# Build classification model
last = model.output
extract = Extract(index=-1, name='Extract')(last)
dense = keras.layers.Dense(units=768, name='Dense')(extract)
norm = keras.layers.BatchNormalization(name='Normal')(dense)
output = keras.layers.Dense(units=11, activation='softmax', name='Softmax')(norm)
model = keras.models.Model(inputs=model.inputs, outputs=output)
model.summary()

# Define the optimizer, loss, and metrics
model.compile(
def get_checkpoint_model(token_num,
                         pos_num=512,
                         seq_len=512,
                         embed_dim=768,
                         transformer_num=12,
                         head_num=12,
                         feed_forward_dim=3072,
                         dropout_rate=0.1,
                         attention_activation=None,
                         feed_forward_activation='gelu',
                         training=True,
                         finetuned=False,
                         output_dim=2,
                         trainable=None,
                         output_layer_num=1,
                         retention_configuration=None,
                         LAMBDA=None,
                         FLAG_EXTRACT_LAYER=None,
                         TASK=None):
    """Get BERT model.

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be concatenated
                             as a single output. Only available when `training` is `False`.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    attention_mask = inputs[2]
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        attention_mask=attention_mask,
        SEQ_LEN=seq_len,
        retention_configuration=retention_configuration,
        LAMBDA=LAMBDA,
        FLAG_EXTRACT_LAYER=FLAG_EXTRACT_LAYER,
    )
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(extract_layer)
    if TASK == 'sts-b':
        # STS-B is a regression task, so its head has no softmax activation.
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            name='NSP',
        )(nsp_dense_layer)
    else:
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs, outputs=nsp_pred_layer)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return model
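# The TASK switch only changes the output head: STS-B is a regression target,
# so its NSP layer omits the softmax. A hedged sketch of the two resulting
# head configurations (the vocabulary size is hypothetical, and the
# retention-related arguments are assumed to tolerate their None defaults).

# Classification head (e.g. a GLUE task such as MRPC): softmax over 2 classes.
clf = get_checkpoint_model(token_num=30522, training=False, output_dim=2,
                           trainable=True, TASK='mrpc')
# STS-B head: a single linear unit, trained with a regression loss such as MSE.
reg = get_checkpoint_model(token_num=30522, training=False, output_dim=1,
                           trainable=True, TASK='sts-b')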