Ejemplo n.º 1
0
    def test_submission(self):
        data = DATADOUBAN(debug=False, data_dir=self.data_dir)
        test_examples = data.read_examples_test(
            os.path.join(self.data_dir, 'test.csv'))
        print('eval_examples的数量', len(test_examples))
        prediction = np.zeros((len(test_examples), self.num_labels))
        gold_labels_ = np.zeros((len(test_examples), self.num_labels))
        logits_ = np.zeros((len(test_examples), self.num_labels))
        questions = [x.text_a for x in test_examples]
        test_features = data.convert_examples_to_features(
            test_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(test_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            test_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            test_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in test_features],
                                 dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)

        for i in range(5):

            config = BertConfig.from_pretrained(self.model_name_or_path,
                                                num_labels=self.num_labels)
            model = BertForSequenceClassification.from_pretrained(
                os.path.join(self.output_dir,
                             "pytorch_model_{}.bin".format(i)),
                self.args,
                config=config)
            model.to(self.device)
            model.eval()

            inference_labels = []
            gold_labels = []

            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(self.device)
                input_mask = input_mask.to(self.device)
                segment_ids = segment_ids.to(self.device)
                label_ids = label_ids.to(self.device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            gold_labels_ = gold_labels
            logits = np.concatenate(inference_labels, 0)
            print(logits.shape)
            print(prediction.shape)

            prediction += logits / 5

        test_id = [x.guid for x in test_examples]
        assert len(test_id) == len(prediction)
        # print(accuracyCQA(prediction, gold_labels_))
        # print(compute_MRR_CQA(questions))
        logits_ = np.argmax(prediction, axis=1)

        submission = pd.DataFrame({'id': test_id, 'predict': logits_})
        submission.to_csv(os.path.join(self.output_dir, "sub.csv"),
                          index=False,
                          header=False)
Ejemplo n.º 2
0
    def create_dataloader(self):
        data = DATADOUBAN(
            debug=False,
            data_dir=self.data_dir,
        )
        train_examples = data.read_examples(
            os.path.join(self.data_dir, 'train.csv'))
        train_features = data.convert_examples_to_features(
            train_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(train_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            train_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            train_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)

        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=self.train_batch_size)
        '''
        eval_examples = data.read_examples(examples_[1])
        eval_features = data.convert_examples_to_features(eval_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.eval_batch_size)
        '''

        eval_examples = data.read_examples(
            os.path.join(self.data_dir, 'dev.csv'))
        eval_features = data.convert_examples_to_features(
            eval_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(eval_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            eval_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            eval_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features],
                                 dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)

        return train_dataloader, eval_dataloader, train_examples, eval_examples
Ejemplo n.º 3
0
    def test_eval(self):
        data = DATADOUBAN(debug=False, data_dir=self.data_dir)
        test_examples = data.read_examples(
            os.path.join(self.data_dir, 'test.csv'))
        print('eval_examples的数量', len(test_examples))

        questions = [x.text_a for x in test_examples]
        ID = [x.guid for x in test_examples]

        test_features = data.convert_examples_to_features(
            test_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(test_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            test_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            test_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in test_features],
                                 dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=self.eval_batch_size)

        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            self.output_dir, "pytorch_model.bin"),
                                                              self.args,
                                                              config=config)
        model.to(self.device)
        model.eval()

        inference_labels = []
        gold_labels = []
        scores = []

        for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            label_ids = label_ids.to(self.device)

            with torch.no_grad():
                logits = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask).detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            scores.append(logits)
            inference_labels.append(np.argmax(logits, axis=1))
            gold_labels.append(label_ids)
        gold_labels = np.concatenate(gold_labels, 0)
        scores = np.concatenate(scores, 0)
        logits = np.concatenate(inference_labels, 0)

        # 计算评价指标
        assert len(ID) == scores.shape[0] == scores.shape[0]
        # eval_accuracy = accuracyCQA(inference_logits, gold_labels)
        # eval_mrr = compute_MRR_CQA(scores, gold_labels, questions)
        eval_5R20 = compute_5R20(scores, gold_labels, questions)
        eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(
            ID, scores, gold_labels)
        # print('eval_mrr',eval_mrr)
        print('eval_MRR', eval_DOUBAN_MRR, eval_DOUBAN_mrr, 'eval_MAP',
              eval_DOUBAN_MAP, 'eval_Precision1', eval_Precision1)
        print('eval_5R20', eval_5R20)