def test_google_weights(self):
    albert_model_name = "albert_base"
    albert_dir = bert.fetch_tfhub_albert_model(albert_model_name, ".models")
    albert_params = bert.albert_params(albert_model_name)
    l_bert = bert.BertModelLayer.from_params(albert_params, name="albert")

    l_input_ids = keras.layers.Input(shape=(128,), dtype='int32', name="input_ids")
    l_token_type_ids = keras.layers.Input(shape=(128,), dtype='int32', name="token_type_ids")

    output = l_bert([l_input_ids, l_token_type_ids])
    output = keras.layers.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2)(output)
    model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

    model.build(input_shape=(None, 128))
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    for weight in l_bert.weights:
        print(weight.name)

    bert.load_albert_weights(l_bert, albert_dir)

    model.summary()
def test_albert_load_base_google_weights(self):  # for coverage mainly
    albert_model_name = "albert_base"
    albert_dir = bert.fetch_tfhub_albert_model(albert_model_name, ".models")

    model_params = bert.albert_params(albert_model_name)
    l_bert = bert.BertModelLayer.from_params(model_params, name="albert")

    model = keras.models.Sequential([
        keras.layers.InputLayer(input_shape=(8,), dtype=tf.int32, name="input_ids"),
        l_bert,
        keras.layers.Lambda(lambda x: x[:, 0, :]),
        keras.layers.Dense(2),
    ])
    model.build(input_shape=(None, 8))
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    bert.load_albert_weights(l_bert, albert_dir)

    model.summary()
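# Illustrative-only sketch, not part of the original tests: once
# load_albert_weights() has run, the compiled Sequential model above can be
# exercised end to end. The token IDs below are random placeholders and the
# vocab_size default is an assumption for albert_base.
def demo_predict(model, seq_len=8, vocab_size=30000):
    import numpy as np
    token_ids = np.random.randint(0, vocab_size, size=(2, seq_len), dtype=np.int32)
    logits = model.predict(token_ids)  # shape: (2, 2) -- two classes
    return logits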
def create_model(l_bert, model_ckpt, max_seq_len, num_labels,
                 label_threshold_less, model_type):
    """
    Builds and compiles a token-level classification model on top of a BERT layer.

    Args:
        l_bert (bert.model.BertModelLayer): BERT layer
        model_ckpt (str): path to best model checkpoint
        max_seq_len (int): maximum sequence length for training data
        num_labels (int): final output dimensionality per token
        label_threshold_less (int): all label IDs strictly less than this number
            will be ignored in class accuracy calculations
        model_type (str): type of model decoder to use, see './utils/model_utils.py'

    Returns:
        model (tensorflow.python.keras.engine.training.Model): final compiled model
            which can be used for fine-tuning
    """
    input_ids = Input(shape=(max_seq_len,), dtype='int32')
    output = l_bert(input_ids)

    if model_type == "TD_Dense":
        output = TimeDistributed(Dense(512))(output)
        output = Activation("relu")(output)
        output = TimeDistributed(Dense(256))(output)
        output = Activation("relu")(output)
        output = TimeDistributed(Dense(128))(output)
        output = Activation("relu")(output)
        output = TimeDistributed(Dense(64))(output)
        output = Activation("relu")(output)
        output = TimeDistributed(Dense(num_labels))(output)
    elif model_type == "1D_CNN":
        output = Conv1D(512, 3, padding="same")(output)
        output = Activation("relu")(output)
        output = Conv1D(256, 3, padding="same")(output)
        output = Activation("relu")(output)
        output = Conv1D(128, 3, padding="same")(output)
        output = Activation("relu")(output)
        output = Conv1D(64, 3, padding="same")(output)
        output = Activation("relu")(output)
        output = Conv1D(num_labels, 3, padding="same")(output)
    elif model_type == "Stacked_LSTM":
        output = LSTM(512, return_sequences=True)(output)
        output = LSTM(256, return_sequences=True)(output)
        output = LSTM(128, return_sequences=True)(output)
        output = TimeDistributed(Dense(64))(output)
        output = Activation("relu")(output)
        output = TimeDistributed(Dense(num_labels))(output)

    prob = Activation("softmax")(output)
    model = tf.keras.Model(inputs=input_ids, outputs=prob)
    model.build(input_shape=(None, max_seq_len))
    bert.load_albert_weights(l_bert, model_ckpt)

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[class_acc(label_threshold_less)])
    model.summary()
    return model
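# Hedged usage sketch for the create_model() above; the checkpoint path,
# sequence length, and label counts are illustrative assumptions, and
# class_acc is expected to come from the same module as create_model.
albert_params = bert.albert_params("albert_base")
l_bert = bert.BertModelLayer.from_params(albert_params, name="albert")
model = create_model(l_bert,
                     model_ckpt=".models/albert_base",  # hypothetical checkpoint location
                     max_seq_len=128,
                     num_labels=10,
                     label_threshold_less=2,
                     model_type="TD_Dense")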
def create_model(
    model_dir,
    model_type,
    max_seq_len,
    n_classes,
    load_pretrained_weights=True,
    summary=False,
):
    """Creates keras model with pretrained BERT/ALBERT layer.

    Args:
        model_dir: String. Path to model.
        model_type: String. Expects either "albert" or "bert".
        max_seq_len: Int. Maximum length of a classification example.
        n_classes: Int. Number of training classes.
        load_pretrained_weights: Boolean. Load pretrained model weights.
        summary: Boolean. Print model summary.

    Returns:
        Keras model
    """
    if model_type == "albert":
        model_ckpt = os.path.join(model_dir, "model.ckpt-best")
        model_params = bert.albert_params(model_dir)
    elif model_type == "bert":
        model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
        model_params = bert.params_from_pretrained_ckpt(model_dir)
    layer_bert = bert.BertModelLayer.from_params(model_params, name=model_type)

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype="int32", name="input_ids")
    output = layer_bert(input_ids)

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=model_params["hidden_size"], activation="relu")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    # no softmax here: the loss below is configured with from_logits=True
    logits = keras.layers.Dense(units=n_classes)(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    if load_pretrained_weights:
        if model_type == "albert":
            bert.load_albert_weights(layer_bert, model_ckpt)
        elif model_type == "bert":
            bert.load_bert_weights(layer_bert, model_ckpt)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
    )

    if summary:
        model.summary()

    return model
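# Hedged example call for the classifier factory above; model_dir is a
# placeholder path and is assumed to contain the expected checkpoint and
# config files for the chosen model_type.
model = create_model(model_dir=".models/albert_base",
                     model_type="albert",
                     max_seq_len=128,
                     n_classes=3,
                     load_pretrained_weights=True,
                     summary=True)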
def test_albert_params(self):
    albert_model_name = "albert_base"
    albert_dir = bert.fetch_tfhub_albert_model(albert_model_name, ".models")
    dir_params = bert.albert_params(albert_dir)
    dir_params.attention_dropout = 0.1  # diff between README and assets/albert_config.json
    dir_params.hidden_dropout = 0.1
    name_params = bert.albert_params(albert_model_name)
    self.assertEqual(name_params, dir_params)

    # coverage
    model_params = dir_params
    model_params.vocab_size = model_params.vocab_size + 2
    model_params.adapter_size = 1
    l_bert = bert.BertModelLayer.from_params(model_params, name="albert")
    l_bert(tf.zeros((1, 128)))
    bert.load_albert_weights(l_bert, albert_dir)
def test_albert_google_weights(self):
    albert_model_name = "albert_base"
    albert_dir = bert.fetch_tfhub_albert_model(albert_model_name, ".models")

    albert_params = bert.albert_params(albert_model_name)
    model, l_bert = self.build_model(albert_params)

    skipped_weight_value_tuples = bert.load_albert_weights(l_bert, albert_dir)
    self.assertEqual(0, len(skipped_weight_value_tuples))
    model.summary()
def Albert_model(max_seq_len):
    model_name = "albert_large"
    model_dir = bert.fetch_tfhub_albert_model(model_name, ".models")
    model_params = bert.albert_params(model_name)
    model_params.shared_layer = True
    model_params.embedding_size = 1024
    l_bert = bert.BertModelLayer.from_params(model_params, name="albert")

    l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    # using the default token_type/segment id 0
    output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]
    output = keras.layers.GlobalAveragePooling1D()(output)

    # use in a Keras Model here, and call model.build()
    model = keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, max_seq_len))

    bert.load_albert_weights(l_bert, model_dir)  # should be called after model.build()
    return model, model_dir
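# Sketch of using Albert_model() to produce pooled sentence embeddings; the
# zero-filled token IDs are placeholders for illustration only. The pooled
# output has shape [batch_size, hidden_size].
import numpy as np
model, model_dir = Albert_model(max_seq_len=128)
embeddings = model.predict(np.zeros((2, 128), dtype=np.int32))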
def test_albert_google_weights_non_tfhub(self):
    albert_model_name = "albert_base_v2"
    albert_dir = bert.fetch_google_albert_model(albert_model_name, ".models")
    model_ckpt = os.path.join(albert_dir, "model.ckpt-best")

    albert_params = bert.albert_params(albert_dir)
    model, l_bert = self.build_model(albert_params)

    skipped_weight_value_tuples = bert.load_albert_weights(l_bert, model_ckpt)
    self.assertEqual(0, len(skipped_weight_value_tuples))
    model.summary()
def test_albert_chinese_weights(self):
    albert_model_name = "albert_base"
    albert_dir = bert.fetch_brightmart_albert_model(albert_model_name, ".models")
    albert_ckpt = os.path.join(albert_dir, "albert_model.ckpt")

    albert_params = bert.params_from_pretrained_ckpt(albert_dir)
    model, l_bert = self.build_model(albert_params)

    skipped_weight_value_tuples = bert.load_albert_weights(l_bert, albert_ckpt)
    self.assertEqual(0, len(skipped_weight_value_tuples))
    model.summary()
def test_albert_zh_fetch_and_load(self):
    albert_model_name = "albert_tiny"
    albert_dir = bert.fetch_brightmart_albert_model(albert_model_name, ".models")
    model_params = bert.params_from_pretrained_ckpt(albert_dir)
    model_params.vocab_size = model_params.vocab_size + 2
    model_params.adapter_size = 1
    l_bert = bert.BertModelLayer.from_params(model_params, name="albert")
    l_bert(tf.zeros((1, 128)))
    res = bert.load_albert_weights(l_bert, albert_dir)
    self.assertTrue(len(res) > 0)
def test_chinese_weights(self):
    # bert_ckpt_dir = ".models/albert_base_zh/"
    # bert_ckpt_file = bert_ckpt_dir + "albert_model.ckpt"
    # bert_config_file = bert_ckpt_dir + "albert_config_base.json"
    print("Eager Execution:", tf.executing_eagerly())

    albert_model_name = "albert_base"
    albert_dir = bert.fetch_brightmart_albert_model(albert_model_name, ".models")
    albert_ckpt = os.path.join(albert_dir, "albert_model.ckpt")

    bert_params = bert.params_from_pretrained_ckpt(albert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    l_input_ids = keras.layers.Input(shape=(128,), dtype='int32', name="input_ids")
    l_token_type_ids = keras.layers.Input(shape=(128,), dtype='int32', name="token_type_ids")

    output = l_bert([l_input_ids, l_token_type_ids])
    output = keras.layers.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2)(output)
    model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

    model.build(input_shape=(None, 128))
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    for weight in l_bert.weights:
        print(weight.name)

    bert.load_albert_weights(l_bert, albert_ckpt)

    model.summary()
def load_bert_model(name_model, max_seq_len, trainable=False):
    """Model names supported are the same as in tf-2.0-bert."""
    model_name = name_model
    model_dir = bert.fetch_tfhub_albert_model(model_name, ".models")
    model_params = bert.albert_params(model_name)
    l_bert = bert.BertModelLayer.from_params(model_params, name=name_model)

    l_input_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]
    model = tf.keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, max_seq_len))

    # load google albert original weights after the build
    bert.load_albert_weights(l_bert, model_dir)
    model.trainable = trainable
    return model
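# Hedged usage: wrap the frozen encoder from load_bert_model() with a small
# trainable classification head. The model name, head sizes, and layer names
# here are assumptions for illustration, not part of the original snippet.
encoder = load_bert_model("albert_base", max_seq_len=128, trainable=False)
seq_output = encoder.output  # [batch_size, max_seq_len, hidden_size]
cls_vector = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(seq_output)
probs = tf.keras.layers.Dense(2, activation="softmax")(cls_vector)
classifier = tf.keras.Model(inputs=encoder.input, outputs=probs)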
def build_transformer(transformer, max_seq_length=None, num_labels=None, tagging=True, tokenizer_only=False):
    spm_model_file = None
    if transformer in zh_albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = zh_albert_models_google[transformer]
        albert = True
    elif transformer in albert_models_tfhub:
        from edparser.layers.transformers.albert_tokenization import FullTokenizer
        with stdout_redirected(to=os.devnull):
            model_url = fetch_tfhub_albert_model(transformer,
                                                 os.path.join(hanlp_home(), 'thirdparty', 'tfhub.dev',
                                                              'google', transformer))
        albert = True
        spm_model_file = glob.glob(os.path.join(model_url, 'assets', '*.model'))
        assert len(spm_model_file) == 1, 'No vocab found or ambiguous vocabs found'
        spm_model_file = spm_model_file[0]
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: '
            f'{list(bert_models_google.keys()) + list(zh_albert_models_google.keys()) + list(albert_models_tfhub.keys())}')

    bert_dir = get_resource(model_url)
    if spm_model_file:
        vocab = glob.glob(os.path.join(bert_dir, 'assets', '*.vocab'))
    else:
        vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, 'No vocab found or ambiguous vocabs found'
    vocab = vocab[0]

    lower_case = any(key in transformer for key in ['uncased', 'multilingual', 'chinese', 'albert'])
    if spm_model_file:
        # noinspection PyTypeChecker
        tokenizer = FullTokenizer(vocab_file=vocab, spm_model_file=spm_model_file, do_lower_case=lower_case)
    else:
        tokenizer = FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)
    if tokenizer_only:
        return tokenizer

    if spm_model_file:
        bert_params = albert_params(bert_dir)
    else:
        bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='albert' if albert else "bert")
    if not max_seq_length:
        return l_bert, tokenizer, bert_dir

    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout, name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=tf.keras.initializers.TruncatedNormal(
                                       bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))

    if not spm_model_file:
        ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
        assert ckpt, f'No checkpoint found under {bert_dir}'
        ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            if spm_model_file:
                skipped_weight_value_tuples = bert.load_albert_weights(l_bert, bert_dir)
            else:
                # noinspection PyUnboundLocalVariable
                skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            # noinspection PyUnboundLocalVariable
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert 0 == len(skipped_weight_value_tuples), f'failed to load pretrained {transformer}'
    return model, tokenizer
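# Hedged usage sketch for build_transformer(); 'albert_base' is assumed to be
# registered in one of the model dictionaries checked above, and the sequence
# length and label count are illustrative.
tokenizer = build_transformer('albert_base', tokenizer_only=True)
model, tokenizer = build_transformer('albert_base', max_seq_length=128,
                                     num_labels=9, tagging=True)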
def load_pretrained(self, pretrained_ckpt):
    self.build(input_shape=(None, 2, self._max_seq_len))
    bert.load_albert_weights(self.l_bert, pretrained_ckpt)
def load_pretrained(self, pretrained_ckpt):
    # self.call(np.zeros((16, 2, self.max_seq_len)))
    self.build(input_shape=(None, 2, self._max_seq_len))
    bert.load_albert_weights(self.l_bert, pretrained_ckpt)
# TODO: Try with more regularisation
# cls_out = keras.layers.Dropout(0.5)(cls_out)
logits = keras.layers.Dense(units=256, activation='relu')(cls_out)
logits = keras.layers.Dropout(0.5)(logits)

# NOTE: Alternative to the Lambda layer
# bgru_layer = keras.layers.Bidirectional(keras.layers.GRU(64))(output)

output = keras.layers.Dense(units=1, activation='sigmoid')(logits)
model = keras.Model(inputs=input_ids, outputs=output)

# Freeze all non-trainable layers
# Originally from tutorial: ['LayerNorm', 'adapter-down', 'adapter-up']
freeze_layers(bert_layer, exclude=['LayerNorm'])

# Build model and load pre-trained weights
model.build(input_shape=(None, MAX_SEQ_LEN))
bert.load_albert_weights(bert_layer, MODEL_DIR)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1_score],
)
model.summary()

# Alternative for loading weights from checkpoint file
# from bert.loader import (StockBertConfig, map_stock_config_to_params,
#                          load_stock_weights)
# bert_ckpt_dir = "gs://bert_models/2018_10_18/uncased_L-12_H-768_A-12/"
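# A minimal sketch of that checkpoint-loading alternative, assuming the stock
# BERT checkpoint above has been downloaded locally (the gs:// bucket cannot
# be read directly here); kept commented out like the note it expands on:
# from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
# with open(os.path.join(bert_ckpt_dir, "bert_config.json")) as f:
#     stock_params = StockBertConfig.from_json_string(f.read())
# bert_params = map_stock_config_to_params(stock_params)
# bert_layer = bert.BertModelLayer.from_params(bert_params, name="bert")
# ... build and model.build() the surrounding keras model as above, then:
# load_stock_weights(bert_layer, os.path.join(bert_ckpt_dir, "bert_model.ckpt"))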