Example #1
    def test_demo(self):
        import torch

        data = DataSet({
            'x1': [[0, 1],
                   [2]],
            'x2': [[3],
                   [2, 4, 5]
                   ],
            'y': [0, 1]
        })
        data.set_target('y')

        # Every collect_fn receives list[(ind1, instance1), (ind2, instance2), ...] as input, where ind1/ind2 are the
        #   indices of those instances in the dataset and instance1/instance2 are the instances drawn for this batch, containing all fields.
        def concat_collect_fn(ins_list):
            x1 = [ins['x1'] for ind,ins in ins_list]
            x2 = [ins['x2'] for ind,ins in ins_list]
            xs = []
            for i in range(len(ins_list)):
                xs.append(torch.LongTensor(x1[i] + x2[i]))
            # Pad and convert to tensors yourself; moving them to the GPU is not required
            arr = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
            b_x = {'x': arr}
            b_y = {}
            # The return value must be two dicts: values in the first dict are treated as input, values in the second as
            #   target. If a name clashes with an existing input or target field, the returned value takes precedence.
            return b_x, b_y

        data.add_collect_fn(concat_collect_fn)

        for batch_x, batch_y in DataSetIter(data, sampler=SequentialSampler(), batch_size=2):
            print("batch_x:", batch_x)
            print("batch_y:", batch_y)
            # batch_x: {'x': tensor([[0, 1, 3, 0],
            #                        [2, 2, 4, 5]])}
            # batch_y: {'y': array([0, 1])}

        # If batching needs extra parameters, implement the collect_fn as a class
        class ConCollectFn:
            def __init__(self, max_len=3):
                self.max_len = max_len
            def __call__(self, ins_list):
                x1 = [ins['x1'] for ind, ins in ins_list]
                x2 = [ins['x2'] for ind, ins in ins_list]
                xs = []
                for i in range(len(ins_list)):
                    xs.append(torch.LongTensor(x1[i] + x2[i])[:self.max_len])
                arr = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=0)
                b_x = {'x': arr}
                b_y = {}
                return b_x, b_y
        data.delete_collect_fn()  # remove the previous collect_fn
        data.add_collect_fn(ConCollectFn(max_len=3))
        for batch_x, batch_y in DataSetIter(data, sampler=SequentialSampler(), batch_size=2):
            print("batch_x:", batch_x)
            print("batch_y:", batch_y)
Example #2
    def test_udf_padder(self):
        from fastNLP.core.field import Padder
        alphas = list('abcdefghijk')

        class UDFPadder(Padder):
            def __init__(self):
                super().__init__()

            def __call__(self, contents, field_name, field_ele_dtype, dim):
                results = [alphas[:con] for con in contents]
                return results

        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)
        contents = np.random.randint(5, size=(num_samples))
        dataset.add_field('test',
                          contents,
                          is_input=True,
                          padder=UDFPadder(),
                          ignore_type=True)

        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            test = batch_x['test']
            indices = batch.cur_batch_indices
            cons = contents[indices]
            for con, t in zip(cons, test):
                self.assertEqual(alphas[:con], t)
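The __call__(contents, field_name, field_ele_dtype, dim) signature above is the hook a custom Padder implements. As a hedged sketch (not taken from the source), a numeric padder that pads variable-length lists to the batch maximum could look like the following; it would be attached the same way as UDFPadder above, via add_field(..., padder=MaxLenPadder(), is_input=True):

import numpy as np
from fastNLP.core.field import Padder

class MaxLenPadder(Padder):
    def __init__(self, fill_value=0):
        super().__init__()
        self._fill = fill_value  # stored on our own attribute to avoid assuming the base-class API

    def __call__(self, contents, field_name, field_ele_dtype, dim):
        # contents holds the raw values of this field for the current batch
        max_len = max(len(c) for c in contents)
        arr = np.full((len(contents), max_len), self._fill, dtype=np.int64)
        for i, c in enumerate(contents):
            arr[i, :len(c)] = c
        return arr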
Example #3
def get_predictions(pred_model, input_data, batch_size, num_workers=4):
    texts = list(list(map(lambda x: vocabs['char'].to_word(x), sample['chars'])) for sample in input_data)
    seq_lens = [sample['seq_len'] for sample in input_data]
    pred_model.to(device)
    sampler = SequentialSampler()
    data_iterator = DataSetIter(dataset=input_data, batch_size=batch_size, sampler=sampler,
                                num_workers=num_workers)
    with torch.no_grad():
        preds, golds = [], []
        pred_model.eval()

        for batch_x, batch_y in data_iterator:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            x = _build_args(pred_model.forward, **batch_x)
            with torch.no_grad():
                y = pred_model.forward(**x)
            preds.extend(list(map(list, y['pred'].cpu().numpy())))
            golds.extend(list(map(list, batch_y['target'].cpu().numpy())))
    pred_seqs = list(list(map(lambda _y: vocabs['label'].to_word(_y), pred)) for pred in preds)
    gold_seqs = list(list(map(lambda _y: vocabs['label'].to_word(_y), gold)) for gold in golds)
    case_result = []
    for pred_seq, gold_seq, word_seq, seq_len in zip(pred_seqs, gold_seqs, texts, seq_lens):
        pred_seq = pred_seq[:seq_len]
        gold_seq = gold_seq[:seq_len]
        case_result.append((''.join(word_seq), extract_kvpairs_in_bmoes(gold_seq, word_seq),
                           extract_kvpairs_in_bmoes(pred_seq, word_seq)))

    # output for case study
    os.makedirs(f'../output/case_study/{args.dataset}', exist_ok=True)
    fout = open(f'../output/case_study/{args.dataset}/{args.dataset}_bert{args.use_bert}_scheme{args.new_tag_scheme}_ple{args.ple_channel_num}_plstm{int(args.use_ple_lstm)}_trainrate{args.train_dataset_rate}.casestudy', 'w', encoding='utf8')
    for word_seq, gold_pair, pred_pair in case_result:
        fout.write(word_seq + '\n' + str(gold_pair) + '\n' + str(pred_pair) + '\n\n')
    fout.close()
Example #4
    def test_collect_fn(self):
        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)
        dataset.set_input('1', '2')
        dataset.set_target('0', '3')

        fn = ConcatCollectFn()
        dataset.add_collect_fn(fn,
                               inputs=['1', '2'],
                               outputs=['12', 'seq_len'],
                               is_input=True,
                               is_target=False)

        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler(),
                            drop_last=True)
        for batch_x, batch_y in batch:
            for i in range(batch_size):
                # print(i)
                self.assertEqual(batch_x['12'][i].sum(),
                                 batch_x['1'][i].sum() + batch_x['2'][i].sum())
                self.assertEqual(batch_x['seq_len'][i],
                                 (batch_x['1'][i] != 0).sum() +
                                 (batch_x['2'][i] != 0).sum())
Example #5
def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            #if i > 10:
            #    break
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()

            tag = batch_y['tag'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?
            #labels = idx2label(pred['pred'], tag_vocab.idx2word)
            #print(pred)
            #print(tag)
            #exit()
            metrics({'pred': pred['pred'].cuda(), 'seq_len':seq_len}, {'tag': batch_y['tag'].cuda()})
        eval_result = metrics.get_metric()
        metric_name = metrics.__class__.__name__
        eval_results[metric_name] = eval_result

    print("[tester] \n{}".format(_format_eval_results(eval_results)))
Example #6
    def test_sequential_batch(self):
        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            pass
Example #7
    def test_list_of_numpy_to_tensor(self):
        ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                     [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            print(x, y)
Example #8
    def test_numpy_padding(self):
        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
        ds.set_input("x")
        ds.set_target("y")
        iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in iter:
            self.assertEqual(x["x"].shape, (4, 4))
            self.assertEqual(y["y"].shape, (4, 4))
Example #9
    def test_simple(self):
        dataset = construct_dataset(
            [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
        dataset.set_target()
        batch = DataSetIter(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)

        cnt = 0
        for _, _ in batch:
            cnt += 1
        self.assertEqual(cnt, 10)
Example #10
    def test_list_to_tensor(self):
        ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
                      "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
        ds.set_input("x")
        ds.set_target("y")
        iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #11
    def test_dataset_batching(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.set_input("x")
        ds.set_target("y")
        iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
            self.assertEqual(len(x["x"]), 4)
            self.assertEqual(len(y["y"]), 4)
            self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
            self.assertListEqual(list(y["y"][-1]), [5, 6])
Example #12
def dump_all_models_prob(config, models):
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))

    data_iterator = Batch(dev_data,
                          config.ensemble_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data,
                               config.ensemble_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for i, model in enumerate(models[:-1]):
        dump_one_model_prob(config.prob_path, config.ensemble_models[i],
                            dev_data, model, data_iterator)
    dump_bert_model_prob(config.prob_path, config.ensemble_models[-1],
                         bert_dev_data, models[-1], bert_data_iterator)
Example #13
    def test_list_of_list_to_tensor(self):
        ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                     [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #14
    def predict(self, data: DataSet, seq_len_field_name=None):
        r"""用已经训练好的模型进行inference.

        :param fastNLP.DataSet data: 待预测的数据集
        :param str seq_len_field_name: 表示序列长度信息的field名字
        :return: dict dict里面的内容为模型预测的结果
        """
        if not isinstance(data, DataSet):
            raise ValueError("Only Dataset class is allowed, not {}.".format(
                type(data)))
        if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
            raise ValueError("Field name {} not found in DataSet {}.".format(
                seq_len_field_name, data))

        prev_training = self.network.training
        self.network.eval()
        network_device = _get_model_device(self.network)
        batch_output = defaultdict(list)
        data_iterator = DataSetIter(data,
                                    batch_size=self.batch_size,
                                    sampler=SequentialSampler(),
                                    as_numpy=False)

        if hasattr(self.network, "predict"):
            predict_func = self.network.predict
        else:
            predict_func = self.network.forward

        with torch.no_grad():
            for batch_x, _ in data_iterator:
                _move_dict_value_to_device(batch_x, _, device=network_device)
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)

                if seq_len_field_name is not None:
                    seq_lens = batch_x[seq_len_field_name].tolist()

                for key, value in prediction.items():
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (len(value.shape) == 2
                                                 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        if seq_len_field_name is not None:
                            tmp_batch = []
                            for idx, seq_len in enumerate(seq_lens):
                                tmp_batch.append(value[idx, :seq_len])
                            batch_output[key].extend(tmp_batch)
                        else:
                            batch_output[key].append(value)

        self.network.train(prev_training)
        return batch_output
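Assuming this method belongs to fastNLP's Predictor class (the surrounding class, including its constructor, is not shown above, so that is an assumption), a hypothetical call could look like the sketch below; model, test_data and the 'seq_len' field are placeholders, not names from the source:

# Hypothetical usage; requires a trained model and a DataSet whose input fields
# match the model's forward/predict signature.
predictor = Predictor(model)
output = predictor.predict(test_data, seq_len_field_name='seq_len')
for key, values in output.items():
    print(key, len(values))  # one list of per-sample predictions per output key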
Example #15
def predict(model, dataset):
    model.eval()
    print(model_status(model.training))
    num_correct = torch.tensor(0.0)
    num_sample = torch.tensor(0.0)
    for batch_x, batch_y in Batch(dataset,
                                  sampler=SequentialSampler(),
                                  batch_size=batch_size):
        x, lengths, y = pack(batch_x, batch_y)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1)
        num_correct += torch.sum(y_predict == y)
        num_sample += x.shape[0]
    return 1.0 * num_correct / num_sample
Example #16
    def predict(self, data: DataSet, seq_len_field_name=None):
        r"""
        """
        if not isinstance(data, DataSet):
            raise ValueError(
                "Only Dataset class is allowed, not {}.".format(type(data)))
        if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
            raise ValueError("Field name {} not found in DataSet {}.".format(
                seq_len_field_name, data))

        self.network.eval()  # self.network.module for multi-GPU
        network_device = _get_model_device(self.network)
        batch_output = defaultdict(list)
        data_iterator = DataSetIter(
            data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

        # predict_func = self.network.module.predict  # self.network.module for
        # multi-GPU
        try:
            predict_func = self.network.predict
        except AttributeError:  # also covers torch's ModuleAttributeError when the model is wrapped in DataParallel
            predict_func = self.network.module.predict

        with torch.no_grad():
            #            for batch_x, _ in tqdm(data_iterator):
            for batch_x, _ in tqdm(data_iterator, total=len(data_iterator)):
                _move_dict_value_to_device(batch_x, _, device=network_device)
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)
                if seq_len_field_name is not None:
                    seq_lens = batch_x[seq_len_field_name].tolist()

                for key, value in prediction.items():
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (
                            len(value.shape) == 2 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        if seq_len_field_name is not None:
                            tmp_batch = []
                            for idx, seq_len in enumerate(seq_lens):
                                tmp_batch.append(value[idx, :seq_len])
                            batch_output[key].extend(tmp_batch)
                        else:
                            batch_output[key].append(value)
        return batch_output
Example #17
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))

    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    schema = get_schemas(config.source_path)

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            #if i > 10:
            #    break
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?

            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)
            #print(pred)
            #print(tag)
            #exit()
            # metrics({'pred': pred['pred'].cuda(), 'seq_len':seq_len}, {'tag': batch_y['tag'].cuda()})
        # eval_result = metrics.get_metric()
        # metric_name = metrics.__class__.__name__
        # eval_results[metric_name] = eval_result

    return result
Example #18
def get_answer(model, dataset):
    answer = []
    print("start to generate result")
    model.eval()
    print(model_status(model.training))
    for batch_x, batch_y in Batch(dataset,
                                  sampler=SequentialSampler(),
                                  batch_size=batch_size):
        x, lengths = pack(batch_x, batch_y, 0)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1).cpu().numpy()
        answer += list(y_predict)
    index = [a + 156061 for a in range(len(answer))]
    name = "result/CNN_pretrain" + str(use_pretrain) + "_freeze" + str(
        freeze_pretrain) + "dropouot" + str(
            dropout_rate) + "_batch_size" + str(batch_size) + "_lr" + str(
                learning_rate) + "_epoch" + str(num_epoch) + ".csv"
    dataframe = pd.DataFrame({'PhraseId': index, 'Sentiment': answer})
    dataframe.to_csv(name, index=False, sep=',')
    return answer
Example #19
            para.requires_grad = False
        else:
            para.requires_grad = True
            print(name)

optimizer = optm.NoamOpt(
    options.d_model, options.factor, 4000,
    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

optimizer._step = options.step

best_model_file_name = "{}/model.bin".format(root_dir)

train_sampler = BucketSampler(batch_size=options.batch_size,
                              seq_len_field_name='seq_len')
dev_sampler = SequentialSampler()

i2t = utils.to_id_list(tag_vocab.word2idx)
i2task = utils.to_id_list(task_vocab.word2idx)
dev_set.set_input("ori_words")
test_set.set_input("ori_words")

word_dic = pickle.load(open("dict.pkl", "rb"))


def tester(model, test_batch, write_out=False):
    res = []
    prf = utils.CWSEvaluator(i2t)
    prf_dataset = {}
    oov_dataset = {}
Example #20
with open("utils/squad_data.pkl", "rb") as f:
    dataset = pickle.load(f)

dev_data = dataset.get_dev_data()
word_vocab = dataset.word_vocab
dev_context_word_field = dev_data.get_field('context_word')
dev_context_word = dev_context_word_field.content

model_path = "/remote-home/competition/Bidaf/fastNLP/reproduction/machine_reading_comprehension/tmp_1/best_BiDAF_f_1_2019-06-29-05-16-19"
print("Loading model from {}".format(model_path))
dev_file = 'cache/data/dev-v1.1.json'
model = torch.load(model_path)
model.eval()
evaluator = SquadEvaluator(dev_file)
batch_size = 256
dev_iter = DataSetIter(dataset=dev_data,batch_size=batch_size,sampler=SequentialSampler())
results = []
processed_num = 0
for batch_x,batch_y in dev_iter:
    print("Batch shape:{}".format(batch_x['context_char'].shape))
    ans = model(batch_x['context_char'],batch_x['context_word'],batch_x['context_word_len'],
                batch_x['question_char'],batch_x['question_word'],batch_x['question_word_len'])

    pred1,pred2 = get_best_answer(ans['start_logits'],ans['end_logits'])
    ans = [x for x in zip(pred1,pred2)]
    results += ans
    processed_num += batch_size
    print("Predicted {} records.".format(processed_num))

with open('./tmp_results.pkl','wb') as f:
    pickle.dump(results,f)
Example #21
    def test_sequential_sampler(self):
        sampler = SequentialSampler()
        data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
        for idx, i in enumerate(sampler(data)):
            assert idx == i
Example #22
def predict(config, models, weight):
    test_data = pickle.load(
        open(os.path.join(config.data_path, config.test_name), "rb"))
    bert_test_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.test_name), "rb"))

    data_iterator = Batch(test_data,
                          config.predict_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_test_data,
                               config.predict_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for model in models:
        model.cuda()

    schema = get_schemas_list(config.source_path)
    weight = torch.tensor(weight).float()
    weight = weight.cuda()
    weight_sum = torch.sum(weight)

    read_data = []
    with open(os.path.join(config.source_path, config.test_source), 'rb') as f:
        for line in f:
            read_data.append(json.loads(line))

    spo_list = []
    with torch.no_grad():
        for i, ((batch_x, _),
                (bert_batch_x,
                 _)) in enumerate(zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            #if i >= 5:
            #    break
            # batch
            text = batch_x['text'].cuda()
            # target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            # label_id = bert_batch_y['label_id'].cuda()

            # assert torch.equal(target, label_id)

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            for j, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum

            for prob in pred['output']:
                spo_list.append(prob2spo(prob, schema))

    with open(os.path.join(config.predict_path, config.predict_name),
              'w') as f:
        for data, spo in zip(read_data, spo_list):
            data["spo_list"] = spo
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
Example #23
    def test_collate_fn(self):
        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)
        dataset.set_input('1', '2')
        dataset.set_target('0', '3')

        fn = ConcatCollateFn(inputs=['1', '2'],
                             output='12',
                             pad_val=0,
                             max_len=0,
                             is_input=True,
                             is_target=False)
        dataset.add_collate_fn(fn, name='demo')
        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler(),
                            drop_last=True)
        for batch_x, batch_y in batch:
            for i in range(batch_size):
                # print(i)
                self.assertEqual(batch_x['12'][i].sum(),
                                 batch_x['1'][i].sum() + batch_x['2'][i].sum())
        dataset.delete_collate_fn(name='demo')

        # Test the case where the fields are not input
        dataset.set_input('1', '2', flag=False)  #
        fn = ConcatCollateFn(inputs=['1', '2'],
                             output='12',
                             pad_val=0,
                             max_len=0,
                             is_input=True,
                             is_target=False)
        dataset.add_collate_fn(fn, name='demo')
        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler(),
                            drop_last=True)
        for batch_x, batch_y in batch:
            for i in range(batch_size):
                self.assertTrue('12' in batch_x)
        dataset.delete_collate_fn(name='demo')
        dataset.set_input('1', '2', flag=True)  #

        # Test the case where an existing field is overridden
        fn = ConcatCollateFn(inputs=['1', '2'],
                             output='3',
                             pad_val=0,
                             max_len=0,
                             is_input=True,
                             is_target=True)
        dataset.add_collate_fn(fn, name='demo')
        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler(),
                            drop_last=True)
        for batch_x, batch_y in batch:
            for i in range(batch_size):
                # print(i)
                self.assertEqual(batch_y['3'][i].sum(),
                                 batch_x['1'][i].sum() + batch_x['2'][i].sum())
        dataset.delete_collate_fn(name='demo')

        # Test the case where the fields are neither input nor target
        dataset.set_input('1', '2', flag=False)
        fn = ConcatCollateFn(inputs=['1', '2'],
                             output='3',
                             pad_val=0,
                             max_len=0,
                             is_input=True,
                             is_target=True)
        dataset.add_collate_fn(fn, name='demo')
        batch = DataSetIter(dataset,
                            batch_size=batch_size,
                            sampler=SequentialSampler(),
                            drop_last=True)
        for batch_x, batch_y in batch:
            for i in range(batch_size):
                # print(i)
                self.assertTrue('3' in batch_x)
                self.assertTrue('3' in batch_y)
        dataset.delete_collate_fn(name='demo')

        # Test adding an invalid fn
        with self.assertRaises(AssertionError):
            dataset.add_collate_fn(1)

        # Test a collate_fn that returns only a single value
        def demo_collate_fn(ins_list):
            return {'3': 1}

        dataset.add_collate_fn(demo_collate_fn, name='demo')
        with self.assertRaises(BaseException):
            batch = DataSetIter(dataset,
                                batch_size=batch_size,
                                sampler=SequentialSampler(),
                                drop_last=True)
            for batch_x, batch_y in batch:
                pass
        dataset.delete_collate_fn(name='demo')

        # Test multiple collate_fns
        dataset.add_collate_fn(demo_collate_fn, name='demo')
        dataset.add_collate_fn(demo_collate_fn, name='demo')
        # Test deletion
        dataset.delete_collate_fn()
        dataset.delete_collate_fn()
        self.assertTrue(dataset.collater.is_empty())
Example #24
def ensemble(config, models, sum_prob=False, weight=[1, 1, 1, 1, 1]):
    f1 = F1_score(pred='output', target='target')
    f1.tp.cuda()
    f1.fp.cuda()
    f1.fn.cuda()

    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))

    data_iterator = Batch(dev_data,
                          config.ensemble_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data,
                               config.ensemble_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for model in models:
        model.cuda()

    eval_results = {}
    weight = torch.tensor(weight)
    weight = weight.cuda()
    weight_sum = torch.sum(weight).float()
    with torch.no_grad():
        for i, ((batch_x, batch_y), (bert_batch_x, bert_batch_y)) in enumerate(
                zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            #if i > 10:
            #    break
            # batch
            text = batch_x['text'].cuda()
            target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            label_id = bert_batch_y['label_id'].cuda()

            #assert torch.equal(target, label_id)

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            #if not sum_prob:
            #    pred['output'][pred['output'] >= 0.5] = 1.0 * weight[-1]
            #    pred['output'][pred['output'] < 0.5] = 0.0
            #    for i, model in enumerate(models[:-1]):
            #        temp = model(text)['output']
            #        temp[temp >= 0.5] = 1.0 * weight[i]
            #        temp[temp < 0.5] = 0.0
            #        pred['output'] += temp
            #else:
            for j, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum

            #bert_batch_y['label_id'].cuda()
            f1({'output': pred['output'].cuda()},
               {'label_id': bert_batch_y['label_id'].cuda()})
        eval_result = f1.get_metric()
        metric_name = f1.__class__.__name__
        eval_results[metric_name] = eval_result

    print("[ensemble] \n{}".format(_format_eval_results(eval_results)))