def test_demo(self):
    import torch

    data = DataSet({
        'x1': [[0, 1], [2]],
        'x2': [[3], [2, 4, 5]],
        'y': [0, 1]
    })
    data.set_target('y')

    # Every collect_fn receives a list [(ind1, instance1), (ind2, instance2), ...],
    # where ind1/ind2 are the indices of the instances in the dataset and
    # instance1/instance2 are the instances drawn for this batch, with all fields.
    def concat_collect_fn(ins_list):
        x1 = [ins['x1'] for ind, ins in ins_list]
        x2 = [ins['x2'] for ind, ins in ins_list]
        xs = []
        for i in range(len(ins_list)):
            xs.append(torch.LongTensor(x1[i] + x2[i]))
        # Padding and conversion to tensor must be done here; moving to GPU is not needed.
        arr = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
        b_x = {'x': arr}
        b_y = {}
        # The return value must be two dicts: the first dict's values are treated as
        # input, the second's as target. If a name clashes with an existing input or
        # target field, the returned value takes precedence.
        return b_x, b_y

    data.add_collect_fn(concat_collect_fn)

    for batch_x, batch_y in DataSetIter(data, sampler=SequentialSampler(), batch_size=2):
        print("batch_x:", batch_x)
        print("batch_y:", batch_y)
        # batch_x: {'x': tensor([[0, 1, 3, 0],
        #                        [2, 2, 4, 5]])}
        # batch_y: {'y': array([0, 1])}

    # If the batching step needs extra parameters, implement the collect_fn as a class.
    class ConCollectFn:
        def __init__(self, max_len=3):
            self.max_len = max_len

        def __call__(self, ins_list):
            x1 = [ins['x1'] for ind, ins in ins_list]
            x2 = [ins['x2'] for ind, ins in ins_list]
            xs = []
            for i in range(len(ins_list)):
                xs.append(torch.LongTensor(x1[i] + x2[i])[:self.max_len])
            arr = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
            b_x = {'x': arr}
            b_y = {}
            return b_x, b_y

    data.delete_collect_fn()  # remove the previous collect_fn
    data.add_collect_fn(ConCollectFn(max_len=3))
    for batch_x, batch_y in DataSetIter(data, sampler=SequentialSampler(), batch_size=2):
        print("batch_x:", batch_x)
        print("batch_y:", batch_y)
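        # Expected output of the truncated loop, derived by hand from the inputs
        # above: with max_len=3 the concatenated sequences are [0, 1] + [3] = [0, 1, 3]
        # and ([2] + [2, 4, 5])[:3] = [2, 2, 4], so no padding is needed:
        # batch_x: {'x': tensor([[0, 1, 3],
        #                        [2, 2, 4]])}
        # batch_y: {'y': array([0, 1])}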
def test_udf_padder(self):
    from fastNLP.core.field import Padder
    alphas = list('abcdefghijk')

    class UDFPadder(Padder):
        def __init__(self):
            super().__init__()

        def __call__(self, contents, field_name, field_ele_dtype, dim):
            results = [alphas[:con] for con in contents]
            return results

    batch_size = 32
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)
    contents = np.random.randint(5, size=(num_samples,))
    dataset.add_field('test', contents, is_input=True,
                      padder=UDFPadder(), ignore_type=True)

    batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_x, batch_y in batch:
        test = batch_x['test']
        indices = batch.cur_batch_indices
        cons = contents[indices]
        for con, t in zip(cons, test):
            self.assertEqual(alphas[:con], t)
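# For contrast with the UDF padder above, a sketch of a conventional numeric padder
# with the same __call__ signature. This is a hand-rolled illustration, not fastNLP's
# built-in AutoPadder: it pads 1d integer sequences with 0 up to the batch maximum.
from fastNLP.core.field import Padder
import numpy as np

class ZeroPadder(Padder):
    def __call__(self, contents, field_name, field_ele_dtype, dim):
        # contents is the list of raw field values for this batch.
        max_len = max(len(c) for c in contents)
        out = np.zeros((len(contents), max_len), dtype=np.int64)
        for i, c in enumerate(contents):
            out[i, :len(c)] = c
        return out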
def get_predictions(pred_model, input_data, batch_size, num_workers=4):
    texts = list(list(map(lambda x: vocabs['char'].to_word(x), sample['chars']))
                 for sample in input_data)
    seq_lens = [sample['seq_len'] for sample in input_data]
    pred_model.to(device)
    sampler = SequentialSampler()
    data_iterator = DataSetIter(dataset=input_data, batch_size=batch_size,
                                sampler=sampler, num_workers=num_workers)

    preds, golds = [], []
    pred_model.eval()
    with torch.no_grad():
        for batch_x, batch_y in data_iterator:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            x = _build_args(pred_model.forward, **batch_x)
            y = pred_model.forward(**x)
            preds.extend(list(map(list, y['pred'].cpu().numpy())))
            golds.extend(list(map(list, batch_y['target'].cpu().numpy())))

    pred_seqs = list(list(map(lambda _y: vocabs['label'].to_word(_y), pred)) for pred in preds)
    gold_seqs = list(list(map(lambda _y: vocabs['label'].to_word(_y), gold)) for gold in golds)
    case_result = []
    for pred_seq, gold_seq, word_seq, seq_len in zip(pred_seqs, gold_seqs, texts, seq_lens):
        pred_seq = pred_seq[:seq_len]
        gold_seq = gold_seq[:seq_len]
        case_result.append((''.join(word_seq),
                            extract_kvpairs_in_bmoes(gold_seq, word_seq),
                            extract_kvpairs_in_bmoes(pred_seq, word_seq)))

    # output for case study
    os.makedirs(f'../output/case_study/{args.dataset}', exist_ok=True)
    out_path = (f'../output/case_study/{args.dataset}/{args.dataset}'
                f'_bert{args.use_bert}_scheme{args.new_tag_scheme}'
                f'_ple{args.ple_channel_num}_plstm{int(args.use_ple_lstm)}'
                f'_trainrate{args.train_dataset_rate}.casestudy')
    with open(out_path, 'w', encoding='utf8') as fout:
        for word_seq, gold_pair, pred_pair in case_result:
            fout.write(word_seq + '\n' + str(gold_pair) + '\n' + str(pred_pair) + '\n\n')
def test_collect_fn(self):
    batch_size = 32
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)
    dataset.set_input('1', '2')
    dataset.set_target('0', '3')

    fn = ConcatCollectFn()
    dataset.add_collect_fn(fn, inputs=['1', '2'],
                           outputs=['12', 'seq_len'],
                           is_input=True, is_target=False)

    batch = DataSetIter(dataset, batch_size=batch_size,
                        sampler=SequentialSampler(), drop_last=True)
    for batch_x, batch_y in batch:
        for i in range(batch_size):
            self.assertEqual(batch_x['12'][i].sum(),
                             batch_x['1'][i].sum() + batch_x['2'][i].sum())
            self.assertEqual(batch_x['seq_len'][i],
                             (batch_x['1'][i] != 0).sum() + (batch_x['2'][i] != 0).sum())
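# From the assertions above, ConcatCollectFn is expected to concatenate the two input
# fields, re-pad the result, and report the combined length as 'seq_len'. Below is a
# hand-rolled sketch of that behavior, not the library implementation; it assumes the
# raw field values are nonzero so the padded nonzero-count check above agrees.
import torch

def concat_collect_sketch(ins_list):
    # ins_list is [(index, instance), ...]; concatenate fields '1' and '2' per instance.
    seqs = [torch.LongTensor(list(ins['1']) + list(ins['2'])) for _, ins in ins_list]
    lens = torch.LongTensor([len(s) for s in seqs])
    arr = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0)
    # First dict -> input fields, second -> target fields, per the collect_fn contract.
    return {'12': arr, 'seq_len': lens}, {}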
def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), 'rb'))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            tag = batch_y['tag'].cuda()
            # pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)
            metrics({'pred': pred['pred'].cuda(), 'seq_len': seq_len},
                    {'tag': tag})
    eval_result = metrics.get_metric()
    metric_name = metrics.__class__.__name__
    eval_results[metric_name] = eval_result
    print("[tester] \n{}".format(_format_eval_results(eval_results)))
def test_sequential_batch(self):
    batch_size = 32
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)

    batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_x, batch_y in batch:
        pass
def test_list_of_numpy_to_tensor(self):
    ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                 [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        print(x, y)
def test_numpy_padding(self):
    ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                  "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertEqual(x["x"].shape, (4, 4))
        self.assertEqual(y["y"].shape, (4, 4))
def test_simple(self):
    dataset = construct_dataset(
        [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"]
         for _ in range(40)])
    dataset.set_target()
    batch = DataSetIter(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)

    cnt = 0
    for _, _ in batch:
        cnt += 1
    self.assertEqual(cnt, 10)
def test_list_to_tensor(self):
    ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
                  "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_dataset_batching(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
        self.assertEqual(len(x["x"]), 4)
        self.assertEqual(len(y["y"]), 4)
        self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
        self.assertListEqual(list(y["y"][-1]), [5, 6])
def dump_all_models_prob(config, models):
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.ensemble_batch,
                          sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data, config.ensemble_batch,
                               sampler=SequentialSampler(), as_numpy=False)
    for i, model in enumerate(models[:-1]):
        dump_one_model_prob(config.prob_path, config.ensemble_models[i],
                            dev_data, model, data_iterator)
    dump_bert_model_prob(config.prob_path, config.ensemble_models[-1],
                         bert_dev_data, models[-1], bert_data_iterator)
def test_list_of_list_to_tensor(self):
    ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                 [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def predict(self, data: DataSet, seq_len_field_name=None):
    r"""Run inference with an already trained model.

    :param fastNLP.DataSet data: the dataset to predict on
    :param str seq_len_field_name: name of the field holding sequence lengths
    :return: dict containing the model's predictions
    """
    if not isinstance(data, DataSet):
        raise ValueError("Only Dataset class is allowed, not {}.".format(type(data)))
    if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
        raise ValueError("Field name {} not found in DataSet {}.".format(
            seq_len_field_name, data))

    prev_training = self.network.training
    self.network.eval()
    network_device = _get_model_device(self.network)
    batch_output = defaultdict(list)
    data_iterator = DataSetIter(data, batch_size=self.batch_size,
                                sampler=SequentialSampler(), as_numpy=False)

    if hasattr(self.network, "predict"):
        predict_func = self.network.predict
    else:
        predict_func = self.network.forward

    with torch.no_grad():
        for batch_x, _ in data_iterator:
            _move_dict_value_to_device(batch_x, _, device=network_device)
            refined_batch_x = _build_args(predict_func, **batch_x)
            prediction = predict_func(**refined_batch_x)

            if seq_len_field_name is not None:
                seq_lens = batch_x[seq_len_field_name].tolist()

            for key, value in prediction.items():
                value = value.cpu().numpy()
                if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                    batch_output[key].extend(value.tolist())
                else:
                    if seq_len_field_name is not None:
                        tmp_batch = []
                        for idx, seq_len in enumerate(seq_lens):
                            tmp_batch.append(value[idx, :seq_len])
                        batch_output[key].extend(tmp_batch)
                    else:
                        batch_output[key].append(value)

    self.network.train(prev_training)
    return batch_output
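# A minimal usage sketch for the method above, assuming it belongs to fastNLP's
# Predictor wrapped around a trained network (the constructor signature here is an
# assumption based on this snippet, not a definitive API reference):
#
#     predictor = Predictor(model)
#     output = predictor.predict(test_data, seq_len_field_name='seq_len')
#
# output maps each key returned by the model's predict/forward to a list of
# per-example results, truncated to each example's seq_len when that field is given.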
def predict(model, dataset):
    model.eval()
    print(model_status(model.training))
    num_correct = torch.tensor(0.0)
    num_sample = torch.tensor(0.0)
    for batch_x, batch_y in Batch(dataset, sampler=SequentialSampler(), batch_size=batch_size):
        x, lengths, y = pack(batch_x, batch_y)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1)
        num_correct += torch.sum(y_predict == y)
        num_sample += x.shape[0]
    return 1.0 * num_correct / num_sample
def predict(self, data: DataSet, seq_len_field_name=None):
    if not isinstance(data, DataSet):
        raise ValueError("Only Dataset class is allowed, not {}.".format(type(data)))
    if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
        raise ValueError("Field name {} not found in DataSet {}.".format(
            seq_len_field_name, data))

    self.network.eval()  # self.network.module for multi-GPU
    network_device = _get_model_device(self.network)
    batch_output = defaultdict(list)
    data_iterator = DataSetIter(data, batch_size=self.batch_size,
                                sampler=SequentialSampler(), as_numpy=False)

    # On a DataParallel-wrapped model, predict lives on self.network.module.
    try:
        predict_func = self.network.predict
    except AttributeError:  # covers torch's ModuleAttributeError subclass
        predict_func = self.network.module.predict

    with torch.no_grad():
        for batch_x, _ in tqdm(data_iterator, total=len(data_iterator)):
            _move_dict_value_to_device(batch_x, _, device=network_device)
            refined_batch_x = _build_args(predict_func, **batch_x)
            prediction = predict_func(**refined_batch_x)

            if seq_len_field_name is not None:
                seq_lens = batch_x[seq_len_field_name].tolist()

            for key, value in prediction.items():
                value = value.cpu().numpy()
                if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                    batch_output[key].extend(value.tolist())
                else:
                    if seq_len_field_name is not None:
                        tmp_batch = []
                        for idx, seq_len in enumerate(seq_lens):
                            tmp_batch.append(value[idx, :seq_len])
                        batch_output[key].extend(tmp_batch)
                    else:
                        batch_output[key].append(value)

    return batch_output
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), 'rb'))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), 'rb'))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    schema = get_schemas(config.source_path)
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            # pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)
            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)
    return result
def get_answer(model, dataset):
    answer = []
    print("start to generate result")
    model.eval()
    print(model_status(model.training))
    for batch_x, batch_y in Batch(dataset, sampler=SequentialSampler(), batch_size=batch_size):
        x, lengths = pack(batch_x, batch_y, 0)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1).cpu().numpy()
        answer += list(y_predict)
    index = [a + 156061 for a in range(len(answer))]
    name = ("result/CNN_pretrain" + str(use_pretrain) +
            "_freeze" + str(freeze_pretrain) +
            "_dropout" + str(dropout_rate) +
            "_batch_size" + str(batch_size) +
            "_lr" + str(learning_rate) +
            "_epoch" + str(num_epoch) + ".csv")
    dataframe = pd.DataFrame({'PhraseId': index, 'Sentiment': answer})
    dataframe.to_csv(name, index=False, sep=',')
    return answer
            para.requires_grad = False
        else:
            para.requires_grad = True
            print(name)

optimizer = optm.NoamOpt(options.d_model, options.factor, 4000,
                         torch.optim.Adam(model.parameters(), lr=0,
                                          betas=(0.9, 0.98), eps=1e-9))
optimizer._step = options.step

best_model_file_name = "{}/model.bin".format(root_dir)

train_sampler = BucketSampler(batch_size=options.batch_size, seq_len_field_name='seq_len')
dev_sampler = SequentialSampler()

i2t = utils.to_id_list(tag_vocab.word2idx)
i2task = utils.to_id_list(task_vocab.word2idx)
dev_set.set_input("ori_words")
test_set.set_input("ori_words")

word_dic = pickle.load(open("dict.pkl", "rb"))

def tester(model, test_batch, write_out=False):
    res = []
    prf = utils.CWSEvaluator(i2t)
    prf_dataset = {}
    oov_dataset = {}
with open("utils/squad_data.pkl", "rb") as f: dataset = pickle.load(f) dev_data = dataset.get_dev_data() word_vocab = dataset.word_vocab dev_context_word_field = dev_data.get_field('context_word') dev_context_word = dev_context_word_field.content model_path = "/remote-home/competition/Bidaf/fastNLP/reproduction/machine_reading_comprehension/tmp_1/best_BiDAF_f_1_2019-06-29-05-16-19" print("Loading model from {}".format(model_path)) dev_file = 'cache/data/dev-v1.1.json' model = torch.load(model_path) model.eval() evaluator = SquadEvaluator(dev_file) batch_size = 256 dev_iter = DataSetIter(dataset=dev_data,batch_size=batch_size,sampler=SequentialSampler()) results = [] processed_num = 0 for batch_x,batch_y in dev_iter: print("Batch shape:{}".format(batch_x['context_char'].shape)) ans = model(batch_x['context_char'],batch_x['context_word'],batch_x['context_word_len'], batch_x['question_char'],batch_x['question_word'],batch_x['question_word_len']) pred1,pred2 = get_best_answer(ans['start_logits'],ans['end_logits']) ans = [x for x in zip(pred1,pred2)] results += ans processed_num += batch_size print("Predicted {} records.".format(processed_num)) with open('./tmp_results.pkl','wb') as f: pickle.dump(results,f)
def test_sequential_sampler(self):
    sampler = SequentialSampler()
    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
    for idx, i in enumerate(sampler(data)):
        assert idx == i
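# A minimal sketch of a sampler that satisfies the test above (an assumption about
# the interface inferred from the test, not fastNLP's actual implementation): a
# fastNLP-style sampler is a callable mapping a dataset to a list of example
# indices, and the sequential variant returns them in their original order.
class MiniSequentialSampler:
    def __call__(self, data_set):
        # One index per example, in order, which is exactly what the assertions check.
        return list(range(len(data_set)))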
def predict(config, models, weight):
    test_data = pickle.load(
        open(os.path.join(config.data_path, config.test_name), "rb"))
    bert_test_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.test_name), "rb"))
    data_iterator = Batch(test_data, config.predict_batch,
                          sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_test_data, config.predict_batch,
                               sampler=SequentialSampler(), as_numpy=False)
    for model in models:
        model.cuda()
    schema = get_schemas_list(config.source_path)
    weight = torch.tensor(weight).float()
    weight = weight.cuda()  # .cuda() is not in-place; assign the result back
    weight_sum = torch.sum(weight)

    read_data = []
    with open(os.path.join(config.source_path, config.test_source), 'rb') as f:
        for line in f:
            read_data.append(json.loads(line))

    spo_list = []
    with torch.no_grad():
        for i, ((batch_x, _), (bert_batch_x, _)) in enumerate(
                zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            # batch
            text = batch_x['text'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            # j, not i: the batch index above must not be shadowed.
            for j, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum
            for prob in pred['output']:
                spo_list.append(prob2spo(prob, schema))

    with open(os.path.join(config.predict_path, config.predict_name), 'w') as f:
        for data, spo in zip(read_data, spo_list):
            data["spo_list"] = spo
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
def test_collate_fn(self):
    batch_size = 32
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)
    dataset.set_input('1', '2')
    dataset.set_target('0', '3')

    fn = ConcatCollateFn(inputs=['1', '2'], output='12', pad_val=0, max_len=0,
                         is_input=True, is_target=False)
    dataset.add_collate_fn(fn, name='demo')
    batch = DataSetIter(dataset, batch_size=batch_size,
                        sampler=SequentialSampler(), drop_last=True)
    for batch_x, batch_y in batch:
        for i in range(batch_size):
            self.assertEqual(batch_x['12'][i].sum(),
                             batch_x['1'][i].sum() + batch_x['2'][i].sum())
    dataset.delete_collate_fn(name='demo')

    # Test the case where the fields are not input.
    dataset.set_input('1', '2', flag=False)
    dataset.add_collate_fn(fn, name='demo')
    batch = DataSetIter(dataset, batch_size=batch_size,
                        sampler=SequentialSampler(), drop_last=True)
    for batch_x, batch_y in batch:
        for i in range(batch_size):
            self.assertTrue('12' in batch_x)
    dataset.delete_collate_fn(name='demo')
    dataset.set_input('1', '2', flag=True)

    # Test overwriting another field.
    fn = ConcatCollateFn(inputs=['1', '2'], output='3', pad_val=0, max_len=0,
                         is_input=True, is_target=True)
    dataset.add_collate_fn(fn, name='demo')
    batch = DataSetIter(dataset, batch_size=batch_size,
                        sampler=SequentialSampler(), drop_last=True)
    for batch_x, batch_y in batch:
        for i in range(batch_size):
            self.assertEqual(batch_y['3'][i].sum(),
                             batch_x['1'][i].sum() + batch_x['2'][i].sum())
    dataset.delete_collate_fn(name='demo')

    # Test the case where the fields are neither input nor target.
    dataset.set_input('1', '2', flag=False)
    fn = ConcatCollateFn(inputs=['1', '2'], output='3', pad_val=0, max_len=0,
                         is_input=True, is_target=True)
    dataset.add_collate_fn(fn, name='demo')
    batch = DataSetIter(dataset, batch_size=batch_size,
                        sampler=SequentialSampler(), drop_last=True)
    for batch_x, batch_y in batch:
        for i in range(batch_size):
            self.assertTrue('3' in batch_x)
            self.assertTrue('3' in batch_y)
    dataset.delete_collate_fn(name='demo')

    # Test adding an invalid fn.
    with self.assertRaises(AssertionError):
        dataset.add_collate_fn(1)

    # Test a collate_fn that returns only a single dict.
    def demo_collate_fn(ins_list):
        return {'3': 1}

    dataset.add_collate_fn(demo_collate_fn, name='demo')
    with self.assertRaises(BaseException):
        batch = DataSetIter(dataset, batch_size=batch_size,
                            sampler=SequentialSampler(), drop_last=True)
        for batch_x, batch_y in batch:
            pass
    dataset.delete_collate_fn(name='demo')

    # Test multiple collate_fns.
    dataset.add_collate_fn(demo_collate_fn, name='demo')
    dataset.add_collate_fn(demo_collate_fn, name='demo')
    # Test deletion.
    dataset.delete_collate_fn()
    dataset.delete_collate_fn()
    self.assertTrue(dataset.collater.is_empty())
def ensemble(config, models, sum_prob=False, weight=[1, 1, 1, 1, 1]):
    f1 = F1_score(pred='output', target='target')
    f1.tp.cuda()
    f1.fp.cuda()
    f1.fn.cuda()
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.ensemble_batch,
                          sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data, config.ensemble_batch,
                               sampler=SequentialSampler(), as_numpy=False)
    for model in models:
        model.cuda()
    eval_results = {}
    weight = torch.tensor(weight)
    weight = weight.cuda()  # .cuda() is not in-place; assign the result back
    weight_sum = torch.sum(weight).float()
    with torch.no_grad():
        for i, ((batch_x, batch_y), (bert_batch_x, bert_batch_y)) in enumerate(
                zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            # batch
            text = batch_x['text'].cuda()
            target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            label_id = bert_batch_y['label_id'].cuda()

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            # A hard-voting variant (thresholding each model's output at 0.5 before
            # weighting, gated by sum_prob) is disabled; soft probability averaging
            # is always used.
            # j, not i: the batch index above must not be shadowed.
            for j, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum
            f1({'output': pred['output'].cuda()},
               {'label_id': bert_batch_y['label_id'].cuda()})

    eval_result = f1.get_metric()
    metric_name = f1.__class__.__name__
    eval_results[metric_name] = eval_result
    print("[ensemble] \n{}".format(_format_eval_results(eval_results)))