def load_model(num_labels, trainable=True):
    # Use BertForTokenClassification
    model = TFBertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=num_labels)
    model.trainable = trainable
    return model
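# Hypothetical usage of load_model() above; the tokenizer choice, num_labels=9,
# and the sample sentence are illustrative assumptions, not part of the original snippet.
from transformers import BertTokenizer
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
ner_model = load_model(num_labels=9, trainable=False)      # frozen encoder, inference only
enc = tokenizer("Ángela vive en Madrid .", return_tensors="tf")
logits = ner_model(enc).logits                             # shape: (1, seq_len, num_labels)
pred_label_ids = tf.argmax(logits, axis=-1)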
def build_model(self, max_length, train_batch_size, learning_rate, epochs,
                num_labels, tagset=None, gpu_growth=True, eval_batch_size=32):
    # if gpu_growth:
    #     model_utils.set_tf_memory_growth()
    if self.task == "pos":
        self.model = TFBertForTokenClassification.from_pretrained(
            self.model_name, num_labels=num_labels, from_pt=True)
        self.tokenizer = MBERT_Tokenizer_pos.from_pretrained(
            self.model_name, do_lower_case=False)
    else:
        self.model = TFBertForSequenceClassification.from_pretrained(
            self.model_name, num_labels=num_labels, from_pt=True)
        self.tokenizer = BertTokenizer.from_pretrained(
            self.model_name, do_lower_case=False)
    # self.model, self.tokenizer = model_utils.create_model(self.short_model_name, self.task, num_labels)
    self.model = model_utils.compile_model(self.model, self.task, learning_rate)
    print("Successfully built", self.model_name)
    self.max_length = max_length
    self.train_batch_size = train_batch_size
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.num_labels = num_labels
    if tagset:
        self.tagset = tagset
        self.label_map = {label: i for i, label in enumerate(tagset)}
    self.eval_batch_size = eval_batch_size
def build_model(pretrained_model_name_or_path, num_labels):
    config = BertConfig.from_pretrained(pretrained_model_name_or_path,
                                        num_labels=num_labels)
    model = TFBertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path, config=config)
    model.layers[-1].activation = tf.keras.activations.softmax
    return model
def get_model():
    config = BertConfig.from_pretrained('bert-base-multilingual-cased', num_labels=3)
    model = TFBertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", config=config)
    model.layers[-1].activation = tf.keras.activations.softmax
    print(model.summary())
    return model
def __init__(self):
    super(MyModel, self).__init__()
    self.bert = TFBertForTokenClassification.from_pretrained(
        'bert-base-chinese', return_dict=True, num_labels=7)
    # TFBertForTokenClassification by yourself
    # self.bert = TFBertModel.from_pretrained('bert-base-chinese')
    # self.dropout = tf.keras.layers.Dropout(0.1)
    # self.classifier = tf.keras.layers.Dense(7, name="classifier")
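# A minimal sketch (not from the original snippet) of how the forward pass of this
# wrapper might be completed; the `call` signature and the use of the returned
# `logits` field are assumptions based on the constructor above.
def call(self, inputs, training=False):
    # inputs is assumed to be the usual dict of input_ids / attention_mask / token_type_ids
    outputs = self.bert(inputs, training=training)
    return outputs.logits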
def getBertModel():
    config.num_labels = 2
    bertModel = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL, config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    bertModel.compile(optimizer=optimizer, loss=loss)
    return bertModel
def test_TFBertForTokenClassification(self):
    from transformers import BertTokenizer, TFBertForTokenClassification
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFBertForTokenClassification.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))
def load_saved_model_test(eval_batch_size=32, model_path="96_64",
                          file_path=os.path.join(c.PROCESSED_DATASET_DIR, c.TEST_FILE)):
    """Create Features & Tokenize"""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
    trained_model = TFBertForTokenClassification.from_pretrained(model_path)

    optimizer = tf.keras.optimizers.Adam()
    metrics = [
        keras.metrics.SparseCategoricalAccuracy('micro_f1/cat_accuracy', dtype=tf.float32),
        macro_f1
    ]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    logging.info("Compiling Model ...")
    trained_model.compile(optimizer=optimizer, loss=loss, metrics=metrics, run_eagerly=True)

    test_data, test_inputs, test_labels = extract_features.retrieve_pred_features(
        file_path, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer, c.LABEL_ID_PKL_FILE)

    # # Test Scores
    # test_loss, test_acc, test_f1_macro = trained_model.evaluate(test_data, batch_size=eval_batch_size)
    # logging.info(str({"Loss": test_loss, "Micro F1/Accuracy": test_acc, "Macro F1": test_f1_macro}))

    # evaluate model with sklearn
    predictions = np.argmax(trained_model.predict(test_data, batch_size=eval_batch_size, verbose=1).logits,
                            axis=-1)
    print(np.shape(predictions), np.shape(test_labels))
    sk_report, macro_f1_score, micro_f1_score, macro_recall_score, macro_precision_score = calculate_pred_metrics(
        test_labels, predictions)
    print('\n', sk_report)
    logging.info(sk_report)
    logging.info("****TEST METRICS****")
    metrics_dict = {
        "Macro_F1": macro_f1_score,
        "Micro_F1": micro_f1_score,
        "Macro_Precision": macro_precision_score,
        "Macro_Recall": macro_recall_score
    }
    logging.info(str(metrics_dict))
    return f'bert_eval_{macro_f1_score}_{uuid.uuid4()}'
def load(name):
    try:
        with open(f'./app/main/models/utils/{name}_idx2tag.pickle', 'rb') as handle:
            idx2tag = pickle.load(handle)
    except FileNotFoundError:
        idx2tag = None
    if name == 'ner':
        model = TFBertForTokenClassification.from_pretrained(
            'bert-base-uncased', num_labels=len(idx2tag))
        with open(f'./app/main/models/weights/{name}.pickle', 'rb') as handle:
            model.set_weights(pickle.load(handle))
    with open(f'./app/main/models/tokenizers/{name}.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return model, tokenizer, idx2tag
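# Hypothetical use of load() above; the example sentence and the assumption that the
# pickled tokenizer behaves like a Hugging Face BertTokenizer are illustrative only.
import tensorflow as tf

model, tokenizer, idx2tag = load('ner')
encoded = tokenizer("Angela Merkel visited Paris .", return_tensors='tf')
logits = model(encoded).logits                       # (1, seq_len, num_labels)
pred_ids = tf.argmax(logits, axis=-1).numpy()[0]
tags = [idx2tag[i] for i in pred_ids]                # map label ids back to tag strings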
import tensorflow as tf
import transformers
from transformers import TFBertForTokenClassification, TFXLMRobertaForTokenClassification

if __name__ == "__main__":
    model = TFBertForTokenClassification.from_pretrained(
        "../norbert3/model.ckpt-1060000.data-00000-of-00001", from_tf=True)
        con1 = (pred_flat == 1)
        con2 = (labels_flat == 1)
        part = np.where(con1 & con2)
        correct = len(part[0])
        sum_pred = np.sum(pred_flat)
        sum_true = np.sum(labels_flat)
        return correct, sum_pred, sum_true

    def get(self):
        return self.reports


# model configuration
bertModel = TFBertForTokenClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # labels as integers
# loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)       # only two labels (one-hot)
# loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # two or more labels, one-hot encoded
# metric = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
eval_metrics = EvaluateModel(train_x, tr_tags, val_x, val_tags)
# bertModel.compile(optimizer=optimizer, loss=loss, metrics=metric)
bertModel.compile(optimizer=optimizer, loss=loss)

# training
# bertModel.fit(x=train_x, y=train_y, epochs=EPOCHS, callbacks=[eval_metrics])
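# Hedged sketch (not part of the original snippet) of how the (correct, sum_pred, sum_true)
# counts returned by the callback fragment above are typically combined into
# precision / recall / F1 for the positive class:
def prf_from_counts(correct, sum_pred, sum_true):
    precision = correct / sum_pred if sum_pred else 0.0
    recall = correct / sum_true if sum_true else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1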
config.num_labels = 2

# maxTokenLen, sent_tokens, sent_token_ids, sent_token_spans, sent_token_tags = \
#     alignSpansBySentence(train_sentences, train_spans)
# input_ids, input_masks, input_tags, token_spans = \
#     chunkData(maxTokenLen, sent_tokens, sent_token_ids, sent_token_spans, sent_token_tags)
train_x, train_y = buildData(train_sentences, train_spans)
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
print('train data loaded:({0})'.format(len(train_y)))

val_x, val_y = buildData(val_sentences, val_spans)
val_dataset = tf.data.Dataset.from_tensor_slices((val_x, val_y))
print('validation data loaded:({0})'.format(len(val_y)))

model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', config=config)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
model.fit(train_dataset.shuffle(len(train_y)).batch(BATCH_SIZE),
          validation_data=val_dataset.batch(BATCH_SIZE),
          epochs=EPOCHS,
          batch_size=BATCH_SIZE)
model.save_weights('/content/drive/MyDrive/ncg/model-SI-BERT/')
print('*****************model saved!******************')
def train_test(epochs, eval_batch_size, epsilon=1e-7, init_lr=2e-5, beta_1=0.9, beta_2=0.999):
    mlflow.log_params({
        "epochs": epochs,
        "eval_batch_size": eval_batch_size,
        "epsilon": epsilon,
        "init_lr": init_lr,
        "beta_1": beta_1,
        "beta_2": beta_2
    })

    """Create Features & Tokenize"""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
    train_data = extract_features.retrieve_features(c.TRAIN_FILE, c.LABELS, c.MAX_SEQ_LENGTH,
                                                    tokenizer, c.LABEL_ID_PKL_FILE)

    config = BertConfig.from_pretrained('bert-base-multilingual-cased', num_labels=len(c.LABELS))
    model = TFBertForTokenClassification.from_pretrained("bert-base-multilingual-cased", config=config)
    model.summary()
    model.layers[-1].activation = tf.keras.activations.softmax

    optimizer = tf.keras.optimizers.Adam(learning_rate=init_lr, epsilon=epsilon,
                                         beta_1=beta_1, beta_2=beta_2)
    metrics = [
        keras.metrics.SparseCategoricalAccuracy('micro_f1/cat_accuracy', dtype=tf.float32),
        macro_f1
    ]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    logging.info("Compiling Model ...")
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics, run_eagerly=True)
    logging.info("Model has been compiled")

    val_data, val_inputs, val_labels = extract_features.retrieve_features(
        c.VALIDATION_FILE, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer, c.LABEL_ID_PKL_FILE)
    test_data, test_inputs, test_labels = extract_features.retrieve_features(
        c.TEST_FILE, c.LABELS, c.MAX_SEQ_LENGTH, tokenizer, c.LABEL_ID_PKL_FILE)
    logging.info("Test Validation features are ready")

    f1_metric = EvalMetrics(val_data, val_labels, eval_batch_size)
    model.fit(train_data, epochs=epochs, validation_data=val_data, callbacks=[f1_metric])
    logging.info("Model Fitting is done")

    # Save Model
    save_dir_path = os.path.join(c.FINAL_OUTPUT_DIR, "model_" + str(time.time()))
    os.mkdir(save_dir_path)
    # tf.saved_model.save(model, export_dir=save_dir_path)
    model.save_pretrained(save_dir_path, saved_model=True)
    logging.info("Model Saved at: {}".format(save_dir_path))

    # Test Scores
    test_loss, test_acc, test_f1_macro = model.evaluate(test_data, batch_size=eval_batch_size)
    logging.info(str({"Loss": test_loss, "Micro F1/Accuracy": test_acc, "Macro F1": test_f1_macro}))

    # evaluate model with sklearn
    predictions = np.argmax(model.predict(test_data, batch_size=eval_batch_size, verbose=1).logits, axis=-1)
    print(np.shape(predictions), np.shape(test_labels))
    sk_report, macro_f1_score, micro_f1_score, macro_recall_score, macro_precision_score = calculate_pred_metrics(
        test_labels, predictions)
    print('\n', sk_report)
    logging.info(sk_report)
    logging.info("****TEST METRICS****")
    metrics_dict = {
        "Loss": test_loss,
        "CatAcc": test_acc,
        "Macro_F1": macro_f1_score,
        "Micro_F1": micro_f1_score,
        "Macro_Precision": macro_precision_score,
        "Macro_Recall": macro_recall_score
    }
    logging.info(str(metrics_dict))
    mlflow.log_metrics(metrics_dict)

    return save_dir_path, [
        f'epochs:{epochs}', f'eval_batch_size: {eval_batch_size}', f'epsilon: {epsilon}',
        f'init_lr: {init_lr}', f'beta_1: {beta_1}', f'beta_2: {beta_2}'
    ], f'bert_{test_acc}_{macro_f1_score}_{uuid.uuid4()}'
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForTokenClassification
from model.model import Model

''' parameters '''
model_name = 'bert-base-cased'
num_labels = 2
max_length = 64
weights_path = 'model/saved_weights/weights.0.21.h5'

''' load model '''
tokenizer = BertTokenizer.from_pretrained(model_name)
model = Model(
    TFBertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
)
# need to optimize this step by loading config instead of weights
model(tf.zeros([1, 3, max_length], tf.int32))
model.load_weights(weights_path)
model.compile(run_eagerly=True)

''' score passages '''
TEXT = "The origin of the name Moabit is disputed. According to one account, \
it can be traced back to the Huguenots, in the time of King Frederick William I of Prussia. \
These French refugees are said to have named their new residence in reference \
to the Biblical description of the Israelites in the country of Moab, where they \
stayed before being allowed to enter Canaan. Other possible origins include \
the German (Berlin dialect) \"Moorjebiet\" (swamp area). "

inputs_ = tokenizer.encode_plus(text=TEXT,
                                max_length=max_length,
                                pad_to_max_length=True,
                                return_token_type_ids=True,
                                return_attention_mask=True)
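# Hedged sketch (assumptions, not part of the original script) of feeding the encoded
# passage to the wrapped model: the [1, 3, max_length] layout and the field order mirror
# the warm-up call model(tf.zeros([1, 3, max_length], tf.int32)) above, and the wrapper
# is assumed to return per-token logits.
features = tf.constant([[inputs_['input_ids'],
                         inputs_['token_type_ids'],
                         inputs_['attention_mask']]], dtype=tf.int32)   # (1, 3, max_length)
logits = model(features)
token_predictions = np.argmax(logits, axis=-1)   # per-token class ids for the passage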
def __init__(self, model_path, num_tags, dropout):
    self.model_path = model_path
    self.num_tags = num_tags
    self.dropout = dropout
    self.encoder = TFBertForTokenClassification.from_pretrained(self.model_path)
from transformers import BertTokenizer, TFBertForTokenClassification, BertForTokenClassification
import tensorflow as tf
import numpy as np
import glob
import loadData

tf.random.set_seed(2019)
np.random.seed(2019)

MAX_TOKEN = 256
PRETRAINED_MODEL = 'bert-base-uncased'
val_path = './SemEval18_Task12/Training/Validation_Data_Codalab/detection'

tf_model = TFBertForTokenClassification.from_pretrained('./save/')
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

token_inputs = tf.keras.Input(shape=(None,), name='input_ids', dtype='int32')
mask_inputs = tf.keras.Input(shape=(None,), name='attention_mask', dtype='int32')
segment_inputs = tf.keras.Input(shape=(None,), name='token_type_ids', dtype='int32')

annFiles = glob.glob(val_path + '/*.ann')
with open('./result/result.html', 'w') as out:
    for annFile in annFiles:
        txtFile = annFile.replace('.ann', '.txt')
        lastIdx = 0
        print(txtFile)
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]

    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, _, dev_len = load_data(args["dev_file"])
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))

    # save tag2id
    save_dict(tag2id, tag2id_path)

    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])

    # get tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()

    # prepare model inputs
    train_x, train_y = create_inputs_targets_roberta(train_data, train_label, tag2id, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets_roberta(dev_data, dev_label, tag2id, max_len, tokenizer)

    # create BERT model
    model = TFBertForTokenClassification.from_pretrained(args["pretrain_model_path"], from_pt=True,
                                                         num_labels=len(list(tag2id.keys())))
    # Adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
    # labels are integer ids rather than one-hot vectors, so use sparse categorical cross-entropy and accuracy
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    model.summary()

    model.fit(train_x, train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1

    # save model weights
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)

    # save SavedModel (pb) format
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")

    # model evaluation
    precision, recall, f1 = model_evaluate_roberta(model, dev_x, dev_label_ori, tag2id, batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(precision, recall, f1))