# Example #1
    def __init__(self, test_query, test_reply):
        """Build the inference pipeline for a query/reply matching test set.

        Args:
            test_query: path to a tab-separated file of (id, question) rows.
            test_reply: path to a tab-separated file of (id, id_sub, reply) rows.

        NOTE(review): this method reads a module-level `opt` (run options) that
        is not passed in — confirm it is defined at module scope before this
        class is instantiated.
        """
        self.tokenizer = Tokenizer4Bert(opt.max_length,
                                        opt.pretrained_bert_name)
        bert_model = BertModel.from_pretrained(opt.pretrained_bert_name,
                                               output_hidden_states=True)
        self.model = opt.model_class(bert_model, opt).to(opt.device)

        # * testset
        # Load the two TSV halves of the test set; files have no header row.
        df_test_query = pd.read_csv(test_query,
                                    sep='\t',
                                    header=None,
                                    encoding='utf-8',
                                    engine='python')
        df_test_query.columns = ['id', 'q1']
        df_test_reply = pd.read_csv(test_reply,
                                    sep='\t',
                                    header=None,
                                    encoding='utf-8',
                                    engine='python')
        df_test_reply.columns = ['id', 'id_sub', 'q2']
        # Missing replies are filled with a placeholder (Chinese for "OK")
        # so downstream tokenization never sees NaN.
        df_test_reply['q2'] = df_test_reply['q2'].fillna('好的')
        # Left-merge on the shared 'id' column: every reply row gets its query.
        df_test_data = df_test_query.merge(df_test_reply, how='left')
        if opt.add_pseudo_data:
            # Keep group ids / row indices / a full copy of the merged frame
            # for pseudo-labeling; deepcopy isolates it from later mutation.
            self.pseudo_groups = df_test_data.loc[:, 'id'].to_numpy()
            self.pseudo_index = np.array(df_test_data.index)
            self.pseudo_data = copy.deepcopy(df_test_data)
        # Independent copy of the reply frame used to build the submission file.
        self.submit = copy.deepcopy(df_test_reply)
        # self.pseudo = copy.deepcopy(df_test_data)
        testset = BertSentenceDataset(df_test_data, self.tokenizer, test=True)

        # Dialogue-mode batches need a custom collate function.
        if opt.dialogue:
            self.test_dataloader = DataLoader(dataset=testset,
                                              batch_size=opt.eval_batch_size,
                                              shuffle=False,
                                              collate_fn=collate_wrapper)
        else:
            self.test_dataloader = DataLoader(dataset=testset,
                                              batch_size=opt.eval_batch_size,
                                              shuffle=False)

        # Optionally build a second loader with query/reply columns swapped
        # (data-reversal augmentation at inference time).
        if opt.datareverse:
            df_test_data_reverse = copy.deepcopy(
                df_test_data[['id', 'q2', 'id_sub', 'q1']])
            testset_reverse = BertSentenceDataset(df_test_data_reverse,
                                                  self.tokenizer,
                                                  test=True)
            self.test_dataloader_reverse = DataLoader(
                dataset=testset_reverse,
                batch_size=opt.eval_batch_size,
                shuffle=False)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(opt.device.index)))
        self._print_args()
# Example #2
    def __init__(self, opt):
        """Load a trained BERT-based classifier and build the test dataloader.

        Args:
            opt: run options; must provide max_length, pretrained_bert_name,
                model_class, model_name, state_dict_path, device,
                dataset_file, polarities_dim and eval_batch_size.
        """
        self.opt = opt
        tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
        bert_model = BertModel.from_pretrained(opt.pretrained_bert_name,
                                               output_hidden_states=True)
        # Kept so callers can restore the pristine BERT weights if needed.
        self.pretrained_bert_state_dict = bert_model.state_dict()
        self.model = opt.model_class(bert_model, opt).to(opt.device)

        print('loading model {0} ...'.format(opt.model_name))

        # map_location prevents a crash when the checkpoint was saved on a
        # different device (e.g. a CUDA checkpoint loaded on a CPU-only box).
        # load_state_dict copies tensors into the already-placed model, so a
        # second .to(opt.device) afterwards is unnecessary.
        self.model.load_state_dict(
            torch.load(opt.state_dict_path, map_location=opt.device))

        # Inference only: disable autograd globally for this process.
        torch.autograd.set_grad_enabled(False)

        testset = BertSentenceDataset(opt.dataset_file['test'], tokenizer,
                                      target_dim=self.opt.polarities_dim,
                                      opt=opt)
        self.test_dataloader = DataLoader(dataset=testset,
                                          batch_size=opt.eval_batch_size,
                                          shuffle=False)
# Example #3
    def __init__(self, opt):
        """Build tokenizer + embedding matrix and load a trained model.

        Args:
            opt: run options; must provide dataset_file, max_length,
                model_name, dataset, embed_dim, model_class,
                state_dict_path and device.
        """
        self.opt = opt
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_length=opt.max_length,
            data_file='./embedding/{0}_{1}_tokenizer.dat'.format(
                opt.model_name, opt.dataset),
        )
        embedding_matrix = build_embedding_matrix(
            vocab=self.tokenizer.vocab,
            embed_dim=opt.embed_dim,
            data_file='./embedding/{0}_{1}d_{2}_embedding_matrix.dat'.format(
                opt.model_name, str(opt.embed_dim), opt.dataset))

        self.model = opt.model_class(embedding_matrix, opt)
        print('loading model {0} ...'.format(opt.model_name))
        # map_location prevents a crash when the checkpoint was saved on a
        # different device than the one we are running on.
        self.model.load_state_dict(
            torch.load(opt.state_dict_path, map_location=opt.device))
        self.model = self.model.to(opt.device)

        # Inference only: disable autograd globally for this process.
        torch.autograd.set_grad_enabled(False)
    def __init__(self, opt):
        """Set up the BERT backbone, train/test datasets and dataloaders.

        Dialogue-style data (datatype == 'diadata') is batched with the
        custom `collate_wrapper`; plain data uses default collation.
        """
        self.opt = opt
        tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
        bert_model = BertModel.from_pretrained(opt.pretrained_bert_name,
                                               output_hidden_states=True)
        self.model = opt.model_class(bert_model, opt).to(opt.device)

        datasets = {
            split: BertSentenceDataset(opt.dataset_file[split], tokenizer,
                                       target_dim=self.opt.polarities_dim,
                                       opt=opt)
            for split in ('train', 'test')
        }

        extra_kwargs = {}
        if opt.datatype == 'diadata':
            extra_kwargs['collate_fn'] = collate_wrapper
        self.train_dataloader = DataLoader(dataset=datasets['train'],
                                           batch_size=opt.train_batch_size,
                                           shuffle=True,
                                           **extra_kwargs)
        self.test_dataloader = DataLoader(dataset=datasets['test'],
                                          batch_size=opt.eval_batch_size,
                                          shuffle=False,
                                          **extra_kwargs)

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(self.opt.device.index)))
        self._print_args()