Example #1
import torch
from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering

bert_path = "bert-base-chinese"  # assumption: any Chinese BERT checkpoint or local path


def task_3():
    # Task 3: question answering (answer-span prediction)
    question, text = "里昂是谁", "里昂是一个杀手"
    sample = (question, text)

    tokenizer = BertTokenizer.from_pretrained(bert_path)
    sen_code = tokenizer.batch_encode_plus(
        [sample])  # question/context pairs can be passed like this: List[Tuple[str, str]]
    tokens_tensor = torch.tensor(sen_code["input_ids"])
    segments_tensor = torch.tensor(sen_code["token_type_ids"])

    model_config = BertConfig.from_pretrained(bert_path)
    # model_config.num_labels = 2  # two outputs in the end: the start and the end position
    # model = BertForQuestionAnswering.from_pretrained(bert_path)  # one way to load (pretrained weights)
    model = BertForQuestionAnswering(model_config)  # another way to load (randomly initialized from the config)

    model.eval()
    # pass segments_tensor by keyword: the second positional argument is attention_mask
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
    start_pos, end_pos = outputs.start_logits, outputs.end_logits

    for idx, (start, end) in enumerate(
            zip(start_pos.argmax(dim=1), end_pos.argmax(dim=1))):
        all_tokens = tokenizer.convert_ids_to_tokens(
            sen_code["input_ids"][idx])  # decode the ids back to the original tokens
        print(
            all_tokens
        )  # ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是', '一', '个', '杀', '手', '[SEP]']
        if start <= end:
            answer = " ".join(all_tokens[start:end + 1])  # decode the predicted answer span
            # Results differ between runs: the QA head is not fine-tuned,
            # so the quality is poor. The output below is just one possibility.
            print(answer)  # 一 个 杀 手 [SEP]
        else:
            print("The prediction is off (start > end)!")
Example #2
import array
import json

import torch
import mlperf_loadgen as lg
from transformers import BertConfig, BertForQuestionAnswering
from squad_QSL import get_squad_QSL  # QSL helper from the surrounding MLPerf benchmark code


class BERT_PyTorch_SUT():
    def __init__(self):
        print("Loading BERT configs...")
        with open("bert_config.json") as f:
            config_json = json.load(f)

        config = BertConfig(
            attention_probs_dropout_prob=config_json["attention_probs_dropout_prob"],
            hidden_act=config_json["hidden_act"],
            hidden_dropout_prob=config_json["hidden_dropout_prob"],
            hidden_size=config_json["hidden_size"],
            initializer_range=config_json["initializer_range"],
            intermediate_size=config_json["intermediate_size"],
            max_position_embeddings=config_json["max_position_embeddings"],
            num_attention_heads=config_json["num_attention_heads"],
            num_hidden_layers=config_json["num_hidden_layers"],
            type_vocab_size=config_json["type_vocab_size"],
            vocab_size=config_json["vocab_size"])

        print("Loading PyTorch model...")
        self.model = BertForQuestionAnswering(config)
        self.model.eval()
        self.model.cuda()
        self.model.load_state_dict(torch.load("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"))

        print("Constructing SUT...")
        self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
        print("Finished constructing SUT.")

        self.qsl = get_squad_QSL()

    def issue_queries(self, query_samples):
        with torch.no_grad():
            for i in range(len(query_samples)):
                eval_features = self.qsl.get_features(query_samples[i].index)
                # run one sample at a time through the model (batch size 1);
                # transformers v4+ returns a ModelOutput, so read the logits off it
                model_output = self.model(
                    input_ids=torch.LongTensor(eval_features.input_ids).unsqueeze(0).cuda(),
                    attention_mask=torch.LongTensor(eval_features.input_mask).unsqueeze(0).cuda(),
                    token_type_ids=torch.LongTensor(eval_features.segment_ids).unsqueeze(0).cuda())
                start_scores = model_output.start_logits
                end_scores = model_output.end_logits
                output = torch.stack([start_scores, end_scores], dim=-1).squeeze(0).cpu().numpy()

                # hand the raw logits back to LoadGen as a byte buffer
                response_array = array.array("B", output.tobytes())
                bi = response_array.buffer_info()
                response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1])
                lg.QuerySamplesComplete([response])

    def flush_queries(self):
        pass

    def process_latencies(self, latencies_ns):
        pass

    def __del__(self):
        print("Finished destroying SUT.")
Example #3
def create_and_check_for_question_answering(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = BertForQuestionAnswering(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
    )
    self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
    self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
Example #4
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids,
                                                 input_mask, sequence_labels, token_labels,
                                                 choice_labels):
    model = BertForQuestionAnswering(config=config)
    model.eval()
    # older transformers versions returned a plain tuple (loss, start_logits, end_logits)
    loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                           start_positions=sequence_labels, end_positions=sequence_labels)
    result = {
        "loss": loss,
        "start_logits": start_logits,
        "end_logits": end_logits,
    }
    self.parent.assertListEqual(
        list(result["start_logits"].size()),
        [self.batch_size, self.seq_length])
    self.parent.assertListEqual(
        list(result["end_logits"].size()),
        [self.batch_size, self.seq_length])
    self.check_loss_output(result)
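
The two tests reflect two API generations: Example #3 reads start_logits/end_logits off the returned ModelOutput (transformers v4+), while Example #4 unpacks the old tuple return. A self-contained sketch, with a deliberately tiny illustrative config, showing that labels still produce a loss on v4+ and that the tuple style can be recovered via to_tuple():

import torch
from transformers import BertConfig, BertForQuestionAnswering

# deliberately tiny config, purely for illustration
config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=2, intermediate_size=64)
model = BertForQuestionAnswering(config)
model.eval()

batch_size, seq_length = 2, 8
input_ids = torch.randint(0, 100, (batch_size, seq_length))
labels = torch.zeros(batch_size, dtype=torch.long)

outputs = model(input_ids, start_positions=labels, end_positions=labels)
assert outputs.start_logits.shape == (batch_size, seq_length)
assert outputs.end_logits.shape == (batch_size, seq_length)
assert outputs.loss is not None  # the loss is computed whenever positions are given

# the old tuple interface can still be recovered explicitly on v4+
loss, start_logits, end_logits = outputs.to_tuple()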
Example #5
def demo4():
    import torch
    import transformers
    from transformers import BertTokenizer, BertForQuestionAnswering

    MODEL_PATH = r"D:\transformr_files\bert-base-uncased/"
    # instantiate the tokenizer from the local vocab file
    tokenizer = BertTokenizer.from_pretrained(
        r"D:\transformr_files\bert-base-uncased\bert-base-uncased-vocab.txt")
    # load BERT's model config
    model_config = transformers.BertConfig.from_pretrained(MODEL_PATH)
    # first build the plain BertModel
    bert_model = transformers.BertModel.from_pretrained(MODEL_PATH,
                                                        config=model_config)
    # there are two outputs in the end: the start and the end position (see below)
    model_config.num_labels = 2
    # likewise build BertForQuestionAnswering from the same config
    model = BertForQuestionAnswering(model_config)
    model.bert = bert_model

    # set evaluation mode
    model.eval()
    question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    # 获取input_ids编码
    input_ids = tokenizer.encode(question, text)
    # 手动进行token_type_ids编码,可用encode_plus代替
    # input_ids = tokenizer.encode_plus("i like you", "but not him")
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    # 得到评分,
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor(
                                         [token_type_ids]))
    # 进行逆编码,得到原始的token
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'nice', 'puppet', '[SEP]']
    # 对输出的答案进行解码的过程
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    # assert answer == "a nice puppet"
    # 这里因为没有经过微调,所以效果不是很好,输出结果不佳。
    print(answer)
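
As the comment above notes, encode_plus can build input_ids and token_type_ids in one call; a minimal sketch using the hub id bert-base-uncased instead of the local path (the QA head is still randomly initialized here):

import torch
from transformers import BertTokenizer, BertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
model.eval()

# encode_plus returns input_ids, token_type_ids and attention_mask together
enc = tokenizer.encode_plus("Who was Jim Henson?",
                            "Jim Henson was a nice puppet",
                            return_tensors="pt")
with torch.no_grad():
    outputs = model(**enc)
print(outputs.start_logits.shape, outputs.end_logits.shape)  # (1, seq_len) each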
Example #6
                    help="model para after pretrained")

args = parser.parse_args()
args.n_gpu = torch.cuda.device_count()
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
device = torch.device(
    "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.device = device
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                          do_lower_case=False)
config = BertConfig.from_pretrained('bert-base-chinese')
model = BertForQuestionAnswering(config)
model_state_dict = args.state_dict
# load on CPU first so this also works when CUDA is unavailable
model.load_state_dict(torch.load(model_state_dict, map_location="cpu"))
model.to(args.device)
model.eval()
input_file = args.predict_file


def handle_file(input_file, context, question):
    # build a SQuAD-style JSON file: one paragraph, one qas entry per question
    orig_data = {"data": [{"paragraphs": [{"context": context, "qas": []}]}]}
    for i in range(len(question)):
        orig_data["data"][0]["paragraphs"][0]["qas"].append({
            "question": question[i],
            "id": str(i),
        })
    with open(input_file, "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(orig_data, indent=4, ensure_ascii=False) + "\n")
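
A usage sketch for handle_file; the file name, passage, and second question below are purely illustrative:

context = "里昂是一个杀手"
questions = ["里昂是谁", "里昂的职业是什么"]  # second question is illustrative
handle_file("predict.json", context, questions)  # "predict.json" is an assumed path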