def evaluate(model, data_loader, raw_dataset, args):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()

    for batch in data_loader:
        input_ids, _ = batch
        start_logits_tensor, end_logits_tensor = model(input_ids)

        for idx in range(start_logits_tensor.shape[0]):
            if len(all_start_logits) % 1000 == 0 and len(all_start_logits):
                print("Processing example: %d" % len(all_start_logits))
                print('time per 1000:', time.time() - tic_eval)
                tic_eval = time.time()

            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, all_nbest_json, scores_diff_json = compute_prediction(
        raw_dataset, data_loader.dataset, (all_start_logits, all_end_logits),
        args.version_2_with_negative, args.n_best_size, args.max_answer_length,
        args.null_score_diff_threshold)

    # Can also write all_nbest_json and scores_diff_json files if needed
    with open('prediction.json', "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(all_predictions, ensure_ascii=False, indent=4) + "\n")

    squad_evaluate(examples=[raw_data for raw_data in raw_dataset],
                   preds=all_predictions,
                   na_probs=scores_diff_json)

    model.train()

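# The evaluate/predict variants in this section all share one post-processing
# flow: collect per-feature start/end logits, decode answer spans with
# compute_prediction, then score them with squad_evaluate. The sketch below
# distills that flow; the paddlenlp.metrics.squad imports are real, but
# `model`, `raw_dataset`, and `data_loader` are placeholder names, not objects
# taken from any one snippet here.
import paddle
from paddlenlp.metrics.squad import compute_prediction, squad_evaluate


def evaluate_sketch(model, raw_dataset, data_loader):
    model.eval()
    all_start_logits, all_end_logits = [], []
    with paddle.no_grad():
        for batch in data_loader:
            start_logits, end_logits = model(*batch)
            # Keep one logit vector per feature (a long example may be split
            # into several overlapping features by the tokenizer).
            all_start_logits.extend(start_logits.numpy())
            all_end_logits.extend(end_logits.numpy())
    # Map features back to their source examples and pick the best spans.
    all_predictions, _, _ = compute_prediction(
        raw_dataset, data_loader.dataset, (all_start_logits, all_end_logits))
    model.train()
    return squad_evaluate(examples=list(raw_dataset), preds=all_predictions)
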
def evaluate(model, data_loader, args, global_step, write_predictions=False):
    model.eval()
    all_start_logits = []
    all_end_logits = []

    for batch in data_loader:
        input_ids = batch[0]
        start_logits_tensor, end_logits_tensor = model(input_ids)

        for idx in range(start_logits_tensor.shape[0]):
            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, all_nbest_json, scores_diff_json = compute_prediction(
        data_loader.dataset.data, data_loader.dataset.new_data,
        (all_start_logits, all_end_logits), args.version_2_with_negative,
        args.n_best_size, args.max_answer_length,
        args.null_score_diff_threshold)

    # Can also write all_nbest_json and scores_diff_json files if needed
    if write_predictions:
        with open(f'{str(global_step)}_prediction.json', "w",
                  encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")

    squad_evaluate(examples=data_loader.dataset.data,
                   preds=all_predictions,
                   na_probs=scores_diff_json)

    model.train()

def predict(self, dataset, raw_dataset, collate_fn, args, do_eval=True):
    batch_sampler = paddle.io.BatchSampler(dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False)
    data_loader = paddle.io.DataLoader(dataset=dataset,
                                       batch_sampler=batch_sampler,
                                       collate_fn=collate_fn,
                                       num_workers=0,
                                       return_list=True)
    outputs = []
    all_start_logits = []
    all_end_logits = []

    for data in data_loader:
        output = self.predict_batch(data)
        outputs.append(output)
        if do_eval:
            all_start_logits.extend(list(output[0]))
            all_end_logits.extend(list(output[1]))

    if do_eval:
        all_predictions, all_nbest_json, scores_diff_json = compute_prediction(
            raw_dataset, data_loader.dataset,
            (all_start_logits, all_end_logits), args.version_2_with_negative,
            args.n_best_size, args.max_answer_length,
            args.null_score_diff_threshold)
        squad_evaluate(examples=[raw_data for raw_data in raw_dataset],
                       preds=all_predictions,
                       na_probs=scores_diff_json)

    return outputs

def evaluate(model, data_loader, is_test=False):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()

    for batch in data_loader:
        input_ids, token_type_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids,
                                                       token_type_ids)

        for idx in range(start_logits_tensor.shape[0]):
            if len(all_start_logits) % 10 == 0 and len(all_start_logits):
                print("Processing example: %d" % len(all_start_logits))
                print('time per 10:', time.time() - tic_eval)
                tic_eval = time.time()

            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, _, _ = compute_prediction(
        data_loader.dataset.data, data_loader.dataset.new_data,
        (all_start_logits, all_end_logits), False, 20, 30)

    if is_test:
        # Can also write all_nbest_json and scores_diff_json files if needed
        with open('prediction.json', "w", encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")
    else:
        squad_evaluate(examples=data_loader.dataset.data,
                       preds=all_predictions,
                       is_whitespace_splited=False)

        # Print a few decoded examples for a quick sanity check.
        count = 0
        for example in data_loader.dataset.data:
            count += 1
            print()
            print('Question:', example['question'])
            print('Context:', ''.join(example['context']))
            print('Answer:', all_predictions[example['id']])
            if count >= 5:
                break

    model.train()

def evaluate(model, data_loader, args, tokenizer, do_pred=False):
    model.eval()
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    all_results = []
    tic_eval = time.time()

    for batch in data_loader:
        input_ids, segment_ids, unique_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids, segment_ids)

        for idx in range(unique_ids.shape[0]):
            if len(all_results) % 1000 == 0 and len(all_results):
                print("Processing example: %d" % len(all_results))
                print('time per 1000:', time.time() - tic_eval)
                tic_eval = time.time()

            unique_id = int(unique_ids[idx])
            start_logits = [float(x) for x in start_logits_tensor.numpy()[idx]]
            end_logits = [float(x) for x in end_logits_tensor.numpy()[idx]]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    all_predictions, _, _ = compute_predictions(
        data_loader.dataset.examples, data_loader.dataset.features,
        all_results, args.n_best_size, args.max_answer_length,
        args.do_lower_case, False, 0.0, args.verbose, tokenizer, False)

    if do_pred:
        with open('prediction.json', "w", encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")
    else:
        squad_evaluate(examples=data_loader.dataset.examples,
                       preds=all_predictions,
                       is_whitespace_splited=False)

    model.train()

def evaluate(model, data_loader, args):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()

    for batch in data_loader:
        input_ids, token_type_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids,
                                                       token_type_ids)

        for idx in range(start_logits_tensor.shape[0]):
            if len(all_start_logits) % 1000 == 0 and len(all_start_logits):
                print("Processing example: %d" % len(all_start_logits))
                print('time per 1000:', time.time() - tic_eval)
                tic_eval = time.time()

            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, _, _ = compute_prediction(
        data_loader.dataset.data, data_loader.dataset.new_data,
        (all_start_logits, all_end_logits), False, args.n_best_size,
        args.max_answer_length)

    # Can also write all_nbest_json and scores_diff_json files if needed
    with open('prediction.json', "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(all_predictions, ensure_ascii=False, indent=4) + "\n")

    squad_evaluate(examples=data_loader.dataset.data,
                   preds=all_predictions,
                   is_whitespace_splited=False)

    model.train()

def evaluate(model, data_loader, args):
    model.eval()
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    all_results = []
    tic_eval = time.time()

    for batch in data_loader:
        input_ids, segment_ids, unique_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids, segment_ids)

        for idx in range(unique_ids.shape[0]):
            if len(all_results) % 1000 == 0 and len(all_results):
                print("Processing example: %d" % len(all_results))
                print('time per 1000:', time.time() - tic_eval)
                tic_eval = time.time()

            unique_id = int(unique_ids[idx])
            start_logits = [float(x) for x in start_logits_tensor.numpy()[idx]]
            end_logits = [float(x) for x in end_logits_tensor.numpy()[idx]]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    all_predictions, all_nbest_json, scores_diff_json = compute_predictions(
        data_loader.dataset.examples, data_loader.dataset.features,
        all_results, args.n_best_size, args.max_answer_length,
        args.do_lower_case, args.version_2_with_negative,
        args.null_score_diff_threshold, args.verbose,
        data_loader.dataset.tokenizer)

    squad_evaluate(data_loader.dataset.examples, all_predictions,
                   scores_diff_json, 1.0)

    model.train()

def evaluate(model, raw_dataset, dataset, data_loader, args, do_eval=True):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()

    for batch in data_loader:
        start_logits, end_logits = model(**batch)

        for idx in range(start_logits.shape[0]):
            if len(all_start_logits) % 1000 == 0 and len(all_start_logits):
                logger.info("Processing example: %d" % len(all_start_logits))
                logger.info('time per 1000: %s' % (time.time() - tic_eval))
                tic_eval = time.time()

            all_start_logits.append(start_logits.numpy()[idx])
            all_end_logits.append(end_logits.numpy()[idx])

    all_predictions, _, _ = compute_prediction(
        raw_dataset, dataset, (all_start_logits, all_end_logits), False,
        args.n_best_size, args.max_answer_length)

    mode = 'validation' if do_eval else 'test'
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if do_eval:
        filename = os.path.join(args.output_dir, 'prediction_validation.json')
    else:
        filename = os.path.join(args.output_dir, 'cmrc2018_predict.json')
    with open(filename, "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(all_predictions, ensure_ascii=False, indent=4) + "\n")

    if do_eval:
        res = squad_evaluate(examples=[raw_data for raw_data in raw_dataset],
                             preds=all_predictions,
                             is_whitespace_splited=False)
        model.train()
        return res['exact'], res['f1']

    model.train()

def evaluate(model, criterion, data_loader, width_mult=1.0):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    metric.reset()

    for batch in data_loader:
        if "cmrc2018" in task_name:
            input_ids, token_type_ids = batch['input_ids'], batch[
                'token_type_ids']
            logits = model(input_ids,
                           token_type_ids,
                           attention_mask=[None, None])
            if width_mult == 100:
                start_logits_tensor, end_logits_tensor = logits
            else:
                start_logits_tensor, end_logits_tensor = logits[0]
            for idx in range(start_logits_tensor.shape[0]):
                if len(all_start_logits) % 1000 == 0 and len(all_start_logits):
                    logger.info("Processing example: %d" %
                                len(all_start_logits))
                all_start_logits.append(start_logits_tensor.numpy()[idx])
                all_end_logits.append(end_logits_tensor.numpy()[idx])
        else:
            input_ids, segment_ids, labels = batch['input_ids'], batch[
                'token_type_ids'], batch['labels']
            logits = model(input_ids, segment_ids, attention_mask=[None, None])
            if isinstance(logits, tuple):
                logits = logits[0]
            loss = criterion(logits, labels)
            if task_name == "msra_ner":
                preds = logits.argmax(axis=2)
                num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                    batch['seq_len'], preds, batch['labels'])
                metric.update(num_infer_chunks.numpy(),
                              num_label_chunks.numpy(),
                              num_correct_chunks.numpy())
            else:
                correct = metric.compute(logits, labels)
                metric.update(correct)

    if "cmrc2018" in task_name:
        n_best_size = 20
        max_answer_length = 50
        all_predictions, _, _ = compute_prediction(
            self.eval_examples, self.eval_dataset,
            (all_start_logits, all_end_logits), False, n_best_size,
            max_answer_length)
        res = squad_evaluate(
            examples=[raw_data for raw_data in self.eval_examples],
            preds=all_predictions,
            is_whitespace_splited=False)
        if width_mult == 100:
            logger.info("teacher model, EM: %f, F1: %f" %
                        (res['exact'], res['f1']))
        else:
            logger.info("width_mult: %s, EM: %f, F1: %f, " %
                        (str(width_mult), res['exact'], res['f1']))
        res = res['exact']
    else:
        res = metric.accumulate()
        # Teacher model's evaluation
        if task_name == "msra_ner":
            if width_mult == 100:
                logger.info(
                    "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                    % (paddle.mean(loss).numpy(), res[0], res[1], res[2]))
            else:
                logger.info(
                    "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                    % (str(width_mult), paddle.mean(loss).numpy(), res[0],
                       res[1], res[2]))
            res = res[2]
        else:
            if width_mult == 100:
                logger.info("teacher model, eval loss: %f, acc: %s, " %
                            (loss.numpy(), res))
            else:
                logger.info("width_mult: %s, eval loss: %f, acc: %s, " %
                            (str(width_mult), loss.numpy(), res))

    model.train()
    return res

if __name__ == '__main__':
    args = get_args()
    if args.language == 'ch':
        ref_ans = read_dataset(args.golden_path)
        pred_ans = read_model_prediction(args.pred_file)
        F1, EM, TOTAL, SKIP = evaluate_ch(ref_ans, pred_ans)

        output_result = OrderedDict()
        output_result['F1'] = '%.3f' % F1
        output_result['EM'] = '%.3f' % EM
        output_result['TOTAL'] = TOTAL
        output_result['SKIP'] = SKIP
        print(json.dumps(output_result))
    else:
        ref_ans = read_dataset(args.golden_path)
        pred_ans = read_temp(args.pred_file)
        res = []
        for i in ref_ans:
            ins = ref_ans[i]
            ins['id'] = str(ins['sent_id'])
            ins['answers'] = [ins['sent_label']]
            if ins['answers'] == [""]:
                ins['is_impossible'] = True
            else:
                ins['is_impossible'] = False
            res.append(ins)
        squad_evaluate(examples=res, preds=pred_ans)

def main(args):
    paddle.enable_static()
    place = paddle.set_device('ipu')
    set_seed(args.seed)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()

    # The sharding of encoder layers
    if args.num_hidden_layers == 12:
        attn_ipu_index = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        ff_ipu_index = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    else:
        raise Exception("Only support num_hidden_layers = 12")

    bert_config = {
        k: getattr(args, k)
        for k in IpuBertConfig._fields if hasattr(args, k)
    }
    bert_config['embeddings_scope'] = DeviceScope(0, 0, "Embedding")
    bert_config['attn_scopes'] = [
        DeviceScope(attn_ipu_index[i], attn_ipu_index[i])
        for i in range(args.num_hidden_layers)
    ]
    bert_config['ff_scopes'] = [
        DeviceScope(ff_ipu_index[i], ff_ipu_index[i])
        for i in range(args.num_hidden_layers)
    ]
    bert_config['layers_per_ipu'] = [6, 6]

    config = IpuBertConfig(**bert_config)

    # custom_ops
    custom_ops = load_custom_ops()

    logging.info("building model")
    if args.is_training:
        [indices, segments, positions, input_mask, start_labels,
         end_labels] = create_data_holder(args)
    else:
        [indices, segments, positions, input_mask] = create_data_holder(args)

    # Encoder Layers
    bert_model = BertModel(config, custom_ops)
    encoders, _ = bert_model(indices, segments, positions, input_mask)

    squad_scope = DeviceScope(args.num_ipus - 1, args.num_ipus - 1, "squad")
    with squad_scope:
        qa_cls = IpuBertForQuestionAnswering(args.hidden_size, args.seq_len)
        start_logits, end_logits = qa_cls(encoders)

        if args.is_training:
            acc_loss = IpuBertQAAccAndLoss(custom_ops)
            acc0, acc1, loss = acc_loss(start_logits, end_logits,
                                        start_labels, end_labels)

    # load squad dataset
    raw_dataset, data_loader = load_squad_dataset(args)

    total_samples = len(data_loader.dataset)
    max_steps = total_samples // args.batch_size * args.epochs
    logging.info("total samples: %d, total batch_size: %d, max steps: %d" %
                 (total_samples, args.batch_size, max_steps))

    if args.is_training:
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                             args.warmup_steps)
        optimizer = paddle.optimizer.Adam(learning_rate=lr_scheduler,
                                          weight_decay=args.weight_decay,
                                          beta1=args.beta1,
                                          beta2=args.beta2,
                                          epsilon=args.adam_epsilon)
        optimizer.minimize(loss)

    # Static executor
    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    # Set initial weights
    state_dict = main_program.state_dict()
    reset_state_dict = reset_program_state_dict(state_dict)
    paddle.static.set_program_state(main_program, reset_state_dict)

    if args.enable_load_params:
        logging.info(f'loading weights from: {args.load_params_path}')
        if not args.load_params_path.endswith('pdparams'):
            raise Exception('need pdparams file')
        with open(args.load_params_path, 'rb') as file:
            params = pickle.load(file)
        # Delete mlm and nsp weights
        if args.is_training and 'linear_72.w_0' in params:
            params.pop("linear_72.w_0")
            params.pop("linear_72.b_0")
        paddle.static.set_program_state(main_program, params)

    if args.tf_checkpoint:
        from load_tf_ckpt import load_initializers_from_tf
        logging.info(f'loading weights from: {args.tf_checkpoint}')
        initializers, _ = load_initializers_from_tf(args.tf_checkpoint, args)
        paddle.static.set_program_state(main_program, initializers)

    # Create ipu_strategy
    ipu_strategy = create_ipu_strategy(args)

    if args.is_training:
        feed_list = [
            "indices", "segments", "positions", "input_mask", "start_labels",
            "end_labels"
        ]
        fetch_list = [loss.name, acc0.name, acc1.name]
    else:
        feed_list = ["indices", "segments", "positions", "input_mask"]
        fetch_list = [start_logits.name, end_logits.name]

    ipu_compiler = paddle.static.IpuCompiledProgram(main_program,
                                                    ipu_strategy=ipu_strategy)
    logging.info('start compiling, this may take a few minutes')
    cur_time = time.time()
    main_program = ipu_compiler.compile(feed_list, fetch_list)
    time_cost = time.time() - cur_time
    logging.info(f'finish compiling! time cost: {time_cost}')

    if args.is_training:
        global_step = 0
        batch_start = time.time()
        for epoch in range(args.epochs):
            for batch in data_loader:
                global_step += 1

                feed = {
                    "indices": batch[0],
                    "segments": batch[1],
                    "positions": batch[2],
                    "input_mask": batch[3],
                    "start_labels": batch[4],
                    "end_labels": batch[5],
                }
                lr_scheduler.step()

                train_start = time.time()
                outputs = exe.run(main_program,
                                  feed=feed,
                                  fetch_list=fetch_list,
                                  use_program_cache=True)
                train_cost = time.time() - train_start
                total_cost = time.time() - batch_start
                tput = args.batch_size / total_cost

                if args.wandb:
                    wandb.log({
                        "epoch": epoch,
                        "global_step": global_step,
                        "loss": np.mean(outputs[0]),
                        "accuracy": np.mean(outputs[1:]),
                        "train_cost": train_cost,
                        "total_cost": total_cost,
                        "throughput": tput,
                        "learning_rate": lr_scheduler(),
                    })

                if global_step % args.logging_steps == 0:
                    logging.info({
                        "epoch": epoch,
                        "global_step": global_step,
                        "loss": np.mean(outputs[0]),
                        "accuracy": np.mean(outputs[1:]),
                        "train_cost": train_cost,
                        "total_cost": total_cost,
                        "throughput": tput,
                        "learning_rate": lr_scheduler(),
                    })

                batch_start = time.time()

        # save final state
        ipu_compiler._backend.weights_to_host()
        paddle.static.save(main_program.org_program,
                           os.path.join(args.output_dir, 'Final_model'))

    if not args.is_training:
        all_start_logits = []
        all_end_logits = []
        for step, batch in enumerate(data_loader):
            if step % args.logging_steps == 0:
                logging.info(f'running step: {step}')

            real_len = np.array(batch[0]).shape[0]
            # padding zeros if needed
            if real_len < args.batch_size:
                batch = [np.asarray(x) for x in batch]
                pad0 = np.zeros([args.batch_size - real_len,
                                 args.seq_len]).astype(batch[0].dtype)
                batch[0] = np.vstack((batch[0], pad0))
                batch[1] = np.vstack((batch[1], pad0))
                batch[2] = np.vstack((batch[2], pad0))
                pad1 = np.zeros(
                    [args.batch_size - real_len, 1, 1, args.seq_len]) - 1e3
                pad1 = pad1.astype(batch[3].dtype)
                batch[3] = np.vstack((batch[3], pad1))

            feed = {
                "indices": batch[0],
                "segments": batch[1],
                "positions": batch[2],
                "input_mask": batch[3],
            }
            start_logits, end_logits = exe.run(main_program,
                                               feed=feed,
                                               fetch_list=fetch_list)
            start_logits = start_logits.reshape([-1, args.seq_len])
            end_logits = end_logits.reshape([-1, args.seq_len])
            for idx in range(real_len):
                all_start_logits.append(start_logits[idx])
                all_end_logits.append(end_logits[idx])

        # evaluate results
        all_predictions, all_nbest_json, scores_diff_json = compute_prediction(
            raw_dataset, data_loader.dataset,
            (all_start_logits, all_end_logits))
        squad_evaluate(examples=[raw_data for raw_data in raw_dataset],
                       preds=all_predictions,
                       na_probs=scores_diff_json)

        # write results to file
        with open('squad_prediction.json', "w", encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")

def evaluate(args, is_test=True):
    # Load the model
    model_state = paddle.load(args.model_path)
    model = ErnieForQuestionAnswering.from_pretrained(args.model_name)
    model.load_dict(model_state)
    model.eval()

    # Load the data
    train_ds, dev_ds, test_ds = load_dataset('dureader_robust',
                                             splits=('train', 'dev', 'test'))
    tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(
        args.model_name)
    test_trans_func = partial(prepare_validation_features,
                              max_seq_length=args.max_seq_length,
                              doc_stride=args.doc_stride,
                              tokenizer=tokenizer)
    test_ds.map(test_trans_func, batched=True, num_workers=4)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)
    test_data_loader = paddle.io.DataLoader(dataset=test_ds,
                                            batch_sampler=test_batch_sampler,
                                            collate_fn=test_batchify_fn,
                                            return_list=True)

    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()

    for batch in test_data_loader:
        input_ids, token_type_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids,
                                                       token_type_ids)

        for idx in range(start_logits_tensor.shape[0]):
            if len(all_start_logits) % 10 == 0 and len(all_start_logits):
                print("Processing example: %d" % len(all_start_logits))
                print('time per 10:', time.time() - tic_eval)
                tic_eval = time.time()

            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, _, _ = compute_prediction(
        test_data_loader.dataset.data, test_data_loader.dataset.new_data,
        (all_start_logits, all_end_logits), False, 20, 30)

    if is_test:
        # Can also write all_nbest_json and scores_diff_json files if needed
        with open('prediction.json', "w", encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")
    else:
        squad_evaluate(examples=test_data_loader.dataset.data,
                       preds=all_predictions,
                       is_whitespace_splited=False)

        # Print a few decoded examples for a quick sanity check.
        count = 0
        for example in test_data_loader.dataset.data:
            count += 1
            print()
            print('Question:', example['question'])
            print('Context:', ''.join(example['context']))
            print('Answer:', all_predictions[example['id']])
            if count >= 5:
                break

    model.train()

def predict(self,
            dataset,
            tokenizer,
            batchify_fn,
            args,
            dev_example=None,
            dev_ds_ori=None):
    if args.collect_shape:
        self.set_dynamic_shape(args.max_seq_length, args.batch_size)

    if args.task_name == "cmrc2018":
        dataset_removed = dataset.remove_columns(
            ["offset_mapping", "attention_mask", "example_id"])
        sample_num = len(dataset)
        batches = []
        for i in range(0, sample_num, args.batch_size):
            batch_size = min(args.batch_size, sample_num - i)
            batch = [dataset_removed[i + j] for j in range(batch_size)]
            batches.append(batch)
    else:
        sample_num = len(dataset)
        batches = []
        for i in range(0, sample_num, args.batch_size):
            batch_size = min(args.batch_size, sample_num - i)
            batch = [dataset[i + j] for j in range(batch_size)]
            batches.append(batch)

    if args.perf:
        for i, batch in enumerate(batches):
            batch = batchify_fn(batch)
            input_ids = batch["input_ids"].numpy()
            segment_ids = batch["token_type_ids"].numpy()
            output = self.predict_batch([input_ids, segment_ids])
            if i > args.perf_warmup_steps:
                break

        time1 = time.time()
        nums = 0
        for batch in batches:
            batch = batchify_fn(batch)
            input_ids = batch["input_ids"].numpy()
            segment_ids = batch["token_type_ids"].numpy()
            nums = nums + input_ids.shape[0]
            output = self.predict_batch([input_ids, segment_ids])
        total_time = time.time() - time1
        print("task name: %s, sample nums: %s, time: %s, QPS: %s " %
              (args.task_name, nums, total_time, nums / total_time))
    else:
        if args.task_name == "msra_ner":
            metric = ChunkEvaluator(label_list=args.label_list)
            metric.reset()
            all_predictions = []
            batch_num = len(dataset['input_ids'])
            for batch in batches:
                batch = batchify_fn(batch)
                input_ids = batch["input_ids"].numpy()
                segment_ids = batch["token_type_ids"].numpy()
                output = self.predict_batch([input_ids, segment_ids])[0]
                preds = np.argmax(output, axis=2)
                all_predictions.append(preds.tolist())
                num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                    batch["seq_len"], paddle.to_tensor(preds), batch["labels"])
                metric.update(num_infer_chunks.numpy(),
                              num_label_chunks.numpy(),
                              num_correct_chunks.numpy())
            res = metric.accumulate()
            print("task name: %s, (precision, recall, f1): %s, " %
                  (args.task_name, res))
        elif args.task_name == "cmrc2018":
            all_start_logits = []
            all_end_logits = []
            for batch in batches:
                batch = batchify_fn(batch)
                input_ids = batch["input_ids"].numpy()
                segment_ids = batch["token_type_ids"].numpy()
                start_logits, end_logits = self.predict_batch(
                    [input_ids, segment_ids])
                for idx in range(start_logits.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        print("Processing example: %d" %
                              len(all_start_logits))
                    all_start_logits.append(start_logits[idx])
                    all_end_logits.append(end_logits[idx])
            all_predictions, _, _ = compute_prediction(
                dev_example, dataset, (all_start_logits, all_end_logits),
                False, args.n_best_size, args.max_answer_length)
            res = squad_evaluate(
                examples=[raw_data for raw_data in dev_example],
                preds=all_predictions,
                is_whitespace_splited=False)
            print("task name: %s, EM: %s, F1: %s" %
                  (args.task_name, res['exact'], res['f1']))
            return all_predictions
        else:
            all_predictions = []
            metric = METRIC_CLASSES[args.task_name]()
            metric.reset()
            for i, batch in enumerate(batches):
                batch = batchify_fn(batch)
                output = self.predict_batch([
                    batch["input_ids"].numpy(),
                    batch["token_type_ids"].numpy()
                ])[0]
                preds = np.argmax(output, axis=1)
                all_predictions.append(preds.tolist())
                correct = metric.compute(paddle.to_tensor(output),
                                         batch["labels"])
                metric.update(correct)
            res = metric.accumulate()
            print("task name: %s, acc: %s, " % (args.task_name, res))
        return all_predictions

def compute_metrics(p: EvalPrediction):
    ret = squad_evaluate(examples=p.label_ids,
                         preds=p.predictions,
                         is_whitespace_splited=False)
    return dict(ret)

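# A hypothetical call to compute_metrics above. EvalPrediction is assumed to
# be the Trainer-style container whose `predictions` field is a dict mapping
# example id to decoded answer text and whose `label_ids` field holds the raw
# validation examples; the example dict keys below mirror the snippets in this
# section and are an assumption, not a documented schema.
examples = [{
    "id": "q1",
    "question": "百度总部在哪里?",
    "context": "百度总部位于北京。",
    "answers": ["北京"],
    "answer_starts": [6],
}]
predictions = {"q1": "北京"}
metrics = compute_metrics(
    EvalPrediction(predictions=predictions, label_ids=examples))
print(metrics)  # expected to include 'exact' and 'f1' entries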