# Task 3: sentence-pair prediction (extractive question answering)
import torch
from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering


def task_3():
    question, text = "里昂是谁", "里昂是一个杀手"
    sample = (question, text)
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    # A question/context pair can be passed as List[Tuple[str, str]]
    sen_code = tokenizer.batch_encode_plus([sample])
    tokens_tensor = torch.tensor(sen_code["input_ids"])
    segments_tensor = torch.tensor(sen_code["token_type_ids"])

    model_config = BertConfig.from_pretrained(bert_path)
    # model_config.num_labels = 2  # two outputs in the end: start and end positions
    # model = BertForQuestionAnswering.from_pretrained(bert_path)  # one way to load
    model = BertForQuestionAnswering(model_config)  # another way: build from config (random weights)
    model.eval()

    # Pass the segment ids by keyword; positionally they would land in the
    # attention_mask slot of forward()
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
    start_pos, end_pos = outputs.start_logits, outputs.end_logits

    for idx, (start, end) in enumerate(
            zip(start_pos.argmax(axis=1), end_pos.argmax(axis=1))):
        # Decode the ids back to the original tokens
        all_tokens = tokenizer.convert_ids_to_tokens(sen_code["input_ids"][idx])
        print(all_tokens)
        # ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是', '一', '个', '杀', '手', '[SEP]']
        if start <= end:
            # Decode the predicted answer span
            answer = " ".join(all_tokens[start:end + 1])
            # Results differ between runs: the model is not fine-tuned, so the
            # output quality is poor; the line below shows one possible run.
            print(answer)  # 一 个 杀 手 [SEP]
        else:
            print("The prediction looks wrong!")
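# Usage sketch for task_3 (assumption: bert_path names a pretrained Chinese
# checkpoint such as "bert-base-chinese"; with the config-only construction
# above the weights are random, so the predicted span stays arbitrary until
# the model is fine-tuned or loaded via from_pretrained).
if __name__ == "__main__":
    bert_path = "bert-base-chinese"
    task_3()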
import array
import json

import torch
import mlperf_loadgen as lg
from transformers import BertConfig, BertForQuestionAnswering

from squad_QSL import get_squad_QSL  # QSL helper from the MLPerf BERT harness


class BERT_PyTorch_SUT():
    def __init__(self):
        print("Loading BERT configs...")
        with open("bert_config.json") as f:
            config_json = json.load(f)

        config = BertConfig(
            attention_probs_dropout_prob=config_json["attention_probs_dropout_prob"],
            hidden_act=config_json["hidden_act"],
            hidden_dropout_prob=config_json["hidden_dropout_prob"],
            hidden_size=config_json["hidden_size"],
            initializer_range=config_json["initializer_range"],
            intermediate_size=config_json["intermediate_size"],
            max_position_embeddings=config_json["max_position_embeddings"],
            num_attention_heads=config_json["num_attention_heads"],
            num_hidden_layers=config_json["num_hidden_layers"],
            type_vocab_size=config_json["type_vocab_size"],
            vocab_size=config_json["vocab_size"])

        print("Loading PyTorch model...")
        self.model = BertForQuestionAnswering(config)
        self.model.eval()
        self.model.cuda()
        self.model.load_state_dict(
            torch.load("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"))

        print("Constructing SUT...")
        self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                                   self.process_latencies)
        print("Finished constructing SUT.")

        self.qsl = get_squad_QSL()

    def issue_queries(self, query_samples):
        with torch.no_grad():
            for i in range(len(query_samples)):
                eval_features = self.qsl.get_features(query_samples[i].index)
                start_scores, end_scores = self.model.forward(
                    input_ids=torch.LongTensor(eval_features.input_ids).unsqueeze(0).cuda(),
                    attention_mask=torch.LongTensor(eval_features.input_mask).unsqueeze(0).cuda(),
                    token_type_ids=torch.LongTensor(eval_features.segment_ids).unsqueeze(0).cuda())
                # Pack start/end logits side by side for the LoadGen response
                output = torch.stack([start_scores, end_scores],
                                     axis=-1).squeeze(0).cpu().numpy()

                response_array = array.array("B", output.tobytes())
                bi = response_array.buffer_info()
                response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1])
                lg.QuerySamplesComplete([response])

    def flush_queries(self):
        pass

    def process_latencies(self, latencies_ns):
        pass

    def __del__(self):
        print("Finished destroying SUT.")
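# A minimal driver sketch for the SUT above (assumptions: the LoadGen bindings
# are importable as `lg`, the QSL helper exposes its LoadGen handle as `.qsl`,
# and the scenario/mode values are illustrative, not the official benchmark
# configuration).
def run_sut():
    sut = BERT_PyTorch_SUT()
    settings = lg.TestSettings()
    settings.scenario = lg.TestScenario.Offline
    settings.mode = lg.TestMode.PerformanceOnly
    lg.StartTest(sut.sut, sut.qsl.qsl, settings)
    lg.DestroyQSL(sut.qsl.qsl)
    lg.DestroySUT(sut.sut)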
def create_and_check_for_question_answering(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels,
    token_labels, choice_labels
):
    model = BertForQuestionAnswering(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
    )
    self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
    self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_bert_for_question_answering(self, config, input_ids,
                                                 token_type_ids, input_mask,
                                                 sequence_labels, token_labels,
                                                 choice_labels):
    model = BertForQuestionAnswering(config=config)
    model.eval()
    loss, start_logits, end_logits = model(input_ids,
                                           attention_mask=input_mask,
                                           token_type_ids=token_type_ids,
                                           start_positions=sequence_labels,
                                           end_positions=sequence_labels)
    result = {
        "loss": loss,
        "start_logits": start_logits,
        "end_logits": end_logits,
    }
    self.parent.assertListEqual(
        list(result["start_logits"].size()),
        [self.batch_size, self.seq_length])
    self.parent.assertListEqual(
        list(result["end_logits"].size()),
        [self.batch_size, self.seq_length])
    self.check_loss_output(result)
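# The two test helpers above target different transformers APIs: the first
# reads attributes off a QuestionAnsweringModelOutput, the second unpacks the
# legacy (loss, start_logits, end_logits) tuple. A small sketch that accepts
# either return style (the helper name is illustrative):
def unpack_qa_outputs(outputs):
    if hasattr(outputs, "start_logits"):
        # ModelOutput path (transformers with return_dict=True)
        return outputs.loss, outputs.start_logits, outputs.end_logits
    # Legacy tuple path
    loss, start_logits, end_logits = outputs
    return loss, start_logits, end_logits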
def demo4():
    import torch
    import transformers
    from transformers import BertTokenizer, BertForQuestionAnswering

    MODEL_PATH = r"D:\transformr_files\bert-base-uncased/"
    # Instantiate the tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        r"D:\transformr_files\bert-base-uncased\bert-base-uncased-vocab.txt")
    # Load BERT's model_config
    model_config = transformers.BertConfig.from_pretrained(MODEL_PATH)
    # First build the bare bert_model
    bert_model = transformers.BertModel.from_pretrained(MODEL_PATH,
                                                        config=model_config)
    # Two outputs in the end: the start position and the end position
    model_config.num_labels = 2
    # Build BertForQuestionAnswering from the same model_config
    model = BertForQuestionAnswering(model_config)
    model.bert = bert_model
    # Switch to evaluation mode
    model.eval()

    question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    # Get the input_ids encoding
    input_ids = tokenizer.encode(question, text)
    # Build token_type_ids by hand; encode_plus can do this instead
    # input_ids = tokenizer.encode_plus("i like you", "but not him")
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    # Get the scores
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor(
                                         [token_type_ids]))
    # Decode the ids back to the original tokens
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'nice', 'puppet', '[SEP]']
    # Decode the predicted answer span
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    # assert answer == "a nice puppet"
    # The model is not fine-tuned here, so the output quality is poor.
    print(answer)
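# As noted in demo4, the hand-rolled token_type_ids can be replaced by
# encode_plus, which returns input_ids, token_type_ids and attention_mask in
# one dict. A sketch assuming a transformers version that returns a
# ModelOutput (older tuple-returning versions would unpack instead); the
# function name is illustrative:
def answer_with_encode_plus(tokenizer, model, question, text):
    import torch
    inputs = tokenizer.encode_plus(question, text, return_tensors="pt")
    outputs = model(**inputs)
    start = int(torch.argmax(outputs.start_logits))
    end = int(torch.argmax(outputs.end_logits))
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    return " ".join(tokens[start:end + 1])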
help="model para after pretrained") args = parser.parse_args() args.n_gpu = torch.cuda.device_count() args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.device = device tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=False) config = BertConfig.from_pretrained('bert-base-chinese') model = BertForQuestionAnswering(config) model_state_dict = args.state_dict model.load_state_dict(torch.load(model_state_dict)) model.to(args.device) model.eval() input_file = args.predict_file def handle_file(input_file, context, question): orig_data = {"data": [{"paragraphs": [{"context": context, "qas": []}]}]} for i in range(len(question)): orig_data["data"][0]['paragraphs'][0]['qas'].append({ 'question': question[i], 'id': str(i) }) with open(input_file, "w", encoding='utf-8') as writer: writer.write( json.dumps(orig_data, indent=4, ensure_ascii=False) + "\n")