def test_TFBertForPreTraining(self):
    from transformers import BertConfig, TFBertForPreTraining
    keras.backend.clear_session()
    # pretrained_weights = 'bert-base-uncased'
    tokenizer_file = 'bert_bert-base-uncased.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = BertConfig()
    model = TFBertForPreTraining(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
def run_chinese_bert():
    # Run the Chinese BERT pre-training heads on a single sentence and decode
    # the most likely token at each position.
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = TFBertForPreTraining.from_pretrained('bert-base-chinese', output_attentions=True)
    # print(f"model config:{model.config}")
    # model.config.output_attentions = True
    input_ids = tf.constant(tokenizer.encode(
        "习近平总书记讲到三个关联“做好疫情防控工作,直接关系人民生命安全和身体健康,直接关系经济社会大局稳定,也事关我国对外开放”"
    ))[None, :]  # Batch size 1
    outputs = model(input_ids)
    print(f"outputs shapes:{outputs}")
    logits = outputs[0]
    # print(logits)
    # print("-" * 30)
    # print(f"outputs:{outputs}")
    print("-" * 30)
    # print(tf.math.argmax(logits, axis=2)[0, :])
    print(tokenizer.decode(tf.math.argmax(logits, axis=2)[0, :]))
def test_TFBertForPreTraining(self):
    from transformers import BertTokenizer, TFBertForPreTraining
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFBertForPreTraining.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
def get_model(label_list):
    K.clear_session()
    bert_model = TFBertForPreTraining.from_pretrained(bert_path, from_pt=True)
    input_indices = Input(shape=(None,), dtype='int32')
    bert_output = bert_model(input_indices)
    projection_logits = bert_output[0]
    bert_cls = Lambda(lambda x: x[:, 0])(
        projection_logits)  # take the vector corresponding to [CLS] for classification
    dropout = Dropout(0.5)(bert_cls)
    output = Dense(len(label_list), activation='softmax')(dropout)
    model = Model(input_indices, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        metrics=['accuracy'])
    print(model.summary())
    return model
from official.modeling import tf_utils

# # The BERT model
# The `BERT` model is built on the `Transformer` encoder and consists of 12 or more
# `EncoderLayer`s, as shown in the figure below; the right side of the figure shows how
# the dimensions of the input data change as it passes through the model:
# <img src="../images/bert结构.png" width="80%">

# +
# The huggingface BERT model
from transformers import BertTokenizer, TFBertForPreTraining

tokenizer = BertTokenizer.from_pretrained(
    '../models/bert/vocabulary.txt',  # load from a local file containing the vocabulary
    do_lower_case=True)
model = TFBertForPreTraining.from_pretrained(
    "../../H/models/huggingface/bert-base-uncased/")

# all of the model's parameters, as a list
params = model.weights
# -

model.summary()

# ## Model configuration
# Parameters that need to be specified when creating the model:
# - `vocab_size`, size of the vocabulary, used for the token embedding matrix
# - `hidden_size=768`, size of the encoder layers
# - `num_hidden_layers=12`, number of encoder layers
# - `num_attention_heads=12`, number of attention heads
# - `intermediate_size=3072`, size of the feed-forward layer inside the encoder
# - `hidden_act="gelu"`, activation function inside the encoder
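# The parameter list above maps directly onto the fields of `transformers.BertConfig`.
# The cell below is a minimal sketch, not part of the original notebook: it assumes the
# standard `bert-base` defaults (the `vocab_size=30522` value is the uncased English
# vocabulary size, an assumption here), and shows how a model can be built from an
# explicit configuration instead of `from_pretrained`.

# +
from transformers import BertConfig, TFBertForPreTraining

config = BertConfig(
    vocab_size=30522,        # size of the vocabulary / token embedding matrix (assumed default)
    hidden_size=768,         # size of the encoder layers
    num_hidden_layers=12,    # number of encoder layers
    num_attention_heads=12,  # number of attention heads
    intermediate_size=3072,  # size of the feed-forward layer inside the encoder
    hidden_act="gelu",       # activation function inside the encoder
)

# Building the model from a config gives randomly initialized weights, unlike
# from_pretrained(), which loads a saved checkpoint.
config_model = TFBertForPreTraining(config)
# -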
print()
print("===============================================================")
print("Processing model:", model_name)
print("Will be saved to:", cache_dir, "\n")
time.sleep(2)

if model_name.startswith("bert"):
    is_encoder_decoder = False
    if "uncased" in model_name:
        vocab_size = 30522
    else:
        vocab_size = 28996
    model = TFBertForPreTraining.from_pretrained(
        model_name, cache_dir=model_cache_dir, **model_kwargs)
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=token_cache_dir)
else:
    is_encoder_decoder = True
    vocab_size = 50265
    model = TFBartForConditionalGeneration.from_pretrained(
        model_name, cache_dir=model_cache_dir, **model_kwargs)
    tokenizer = BartTokenizer.from_pretrained(
        model_name, cache_dir=token_cache_dir)

print("Exporting Model to SavedModel at:", pb_model_dir)
hf_model = HFModel(
    model,
def parse_text(filename):
    np.random.seed(42)
    max_seq_length = 512
    seq_length = 512
    max_predictions_per_seq = 20
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Store the document in ram
    with open(filename, "r") as infile:
        # Grab two sentences
        text = infile.read()
    lines = [
        line for line in text.split("\n \n \n")
        if (len(line) > 0 and not line.isspace())
    ]

    # TODO: Tokenizer batch encode
    breakpoint()

    # Let's just grab one document for now.
    idx = np.random.randint(len(lines) - 1)
    line1, line2 = lines[idx], lines[idx + 1]
    line = np.random.choice(lines)

    # And choose a subset of tokens.
    tokens = line.split(" ")
    if len(tokens) < seq_length:
        seq_length = len(tokens)
        start = 0
    else:
        start = np.random.randint(len(tokens) - seq_length)

    span = np.empty(max_seq_length, dtype=str)
    span[:seq_length] = np.array(tokens[start:start + seq_length])

    # Now tokenize
    mask = np.random.choice(np.arange(seq_length),
                            size=max_predictions_per_seq,
                            replace=False)
    span[mask] = "[MASK]"
    # TODO: Corrupt some of these instead of mask
    input_ids = span

    # 0 represents a padding token
    attention_mask = np.ones(max_seq_length)
    attention_mask[seq_length:] = 0

    # 0 represents sentence A, 1 is sentence B
    token_type_ids = np.ones(max_seq_length)
    # token_type_ids =
    print(span)

    my_dict = tokenizer.encode_plus(line, line, return_tensors="tf")
    input_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }

    # 80% should remain the same
    # 10% should be masked
    # 10% should be corrupted
    model = TFBertForPreTraining.from_pretrained("bert-base-uncased")
    model(my_dict)
    breakpoint()