def get_dataloader(train_data, valid_data):
    train_dataset = PairDataset(train_data, num_neg=0)
    valid_dataset = PairDataset(valid_data, num_neg=0)
    padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                             fixed_length_resp=fixed_length_resp,
                             fixed_length_turn=fixed_length_turn,
                             data_type=data_type)
    train_dataloader = DictDataLoader(train_dataset, batch_size=batch_size,
                                      turns=fixed_length_turn, stage='train',
                                      shuffle=True, sort=False, callback=padding)
    valid_dataloader = DictDataLoader(valid_dataset, batch_size=batch_size,
                                      turns=fixed_length_turn, stage='dev',
                                      shuffle=False, sort=False, callback=padding)
    # Log the shape of the utterance tensor for the first batch as a sanity check
    for i, (x, y) in enumerate(train_dataloader):
        logger.info(f"The shape of utterance is {x[constants.UTTRS].shape}")
        if i == 0:
            break
    return train_dataloader, valid_dataloader
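# A minimal consumption sketch (not in the original source): one training
# epoch over the loaders returned above. `model`, `optimizer`, and `criterion`
# are hypothetical placeholders; it assumes the model accepts the padded batch
# dict produced by MultiQAPadding and returns logits.
def train_one_epoch(model, train_dataloader, optimizer, criterion):
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()
        logits = model(x)            # forward pass on the padded batch dict
        loss = criterion(logits, y)  # labels come from PairDataset
        loss.backward()
        optimizer.step()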
def get_dataloader(data):
    dataset = PairDataset(data, num_neg=0)
    padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                             fixed_length_resp=fixed_length_resp,
                             fixed_length_turn=fixed_length_turn)
    dataloader = DictDataLoader(dataset, batch_size=batch_size,
                                turns=fixed_length_turn, stage='dev',
                                shuffle=False, sort=False, callback=padding)
    return dataloader
def get_dataloader(self, utterance: str, responses: list):
    # Pair the single tab-separated context with every candidate response
    data = pd.DataFrame()
    turns = len(utterance.split("\t"))
    data[constants.RESP] = responses
    data[constants.UTTRS] = utterance
    data[constants.TURNS] = turns
    data[constants.LABEL] = 0
    data = self.preprocessor.transform(data, drop=False)
    dataset = PairDataset(data, num_neg=0)
    padding = MultiQAPadding(self.uttr_len, self.resp_len, self.turns,
                             data_type=self.data_type)
    dataloader = DictDataLoader(dataset, batch_size=len(dataset),
                                turns=self.turns, stage="test",
                                device=self.device, shuffle=False,
                                sort=False, callback=padding)
    return dataloader
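# Hedged usage example (assumed, mirroring the method above): score a batch of
# candidate responses for one tab-separated multi-turn context. `predictor` and
# `model` are hypothetical names for the enclosing class instance and its network.
utterance = "你好\t今天天气怎么样"            # two turns joined by "\t"
responses = ["今天是晴天", "我不知道你在说什么"]
dataloader = predictor.get_dataloader(utterance, responses)
for x, _ in dataloader:                      # a single batch of size len(responses)
    scores = model(x)                        # one relevance score per candidate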
data['label'] = data['label'].astype(int)

# Split into training and validation sets
train = data[:90]
valid = data[90:]

# Load pre-trained word vectors
basename = "/home/speech/models"

# Build the embedding matrix
logger.info("Reading the word-embedding file")
word_embedding = load_from_file(Path(basename) / "500000-small.txt")
embedding_matrix = word_embedding.build_matrix(preprocessor.context['term_index'])

# Wrap the training and validation sets
logger.info("Wrapping the data with Dataset and DataLoader")
train_dataset = PairDataset(train, num_neg=0)
valid_dataset = PairDataset(valid, num_neg=0)
padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                         fixed_length_resp=fixed_length_resp,
                         fixed_length_turn=fixed_length_turn)
train_dataloader = DictDataLoader(train_dataset, batch_size=16,
                                  turns=fixed_length_turn,
                                  shuffle=False, sort=False, callback=padding)
valid_dataloader = DictDataLoader(valid_dataset, batch_size=16,
                                  turns=fixed_length_turn,
                                  shuffle=False, sort=False, callback=padding)
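# Sketch under an assumption the source does not show: feeding the embedding
# matrix built above into a PyTorch embedding layer. `freeze` and `padding_idx`
# are illustrative choices, assuming index 0 is the padding token in term_index.
import torch
import torch.nn as nn

embedding = nn.Embedding.from_pretrained(
    torch.tensor(embedding_matrix, dtype=torch.float),
    freeze=False,   # fine-tune the pre-trained vectors during training
    padding_idx=0,
)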
## --------------------- 01 Test the preprocessor --------------------------
print("Preprocessing the data...")
preprocessor = CNAlbertPreprocessorForMultiQA(Path(albert_path) / vocab_file,
                                              uttr_len=fixed_length_uttr,
                                              resp_len=fixed_length_resp)
data = preprocessor.transform(data)
data = data[['D_num', 'turns', 'utterances', 'response',
             'utterances_len', 'response_len']]
data['label'] = 1

## --------------------- 02 Wrap the data --------------------------
dataset = PairDataset(data, num_neg=0)
padding = MultiQAPadding(fixed_length_uttr=fixed_length_uttr,
                         fixed_length_resp=fixed_length_resp,
                         fixed_length_turn=fixed_length_turn)
dataloader = DictDataLoader(dataset, batch_size=batch_size,
                            turns=fixed_length_turn,
                            shuffle=False, sort=False, callback=padding)

## -------------------- 03 Define the model and run a forward pass -----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Model under test: {name.upper()}")
print("Defining the model and its parameters...")
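# Hedged continuation (the snippet stops before the model code): a plausible
# smoke test that instantiates the model under test and pushes one batch
# through it. `MODELS` (a name-to-class registry) and the constructor
# signature are assumptions, not the project's confirmed API.
model = MODELS[name](uttr_len=fixed_length_uttr,
                     resp_len=fixed_length_resp,
                     turns=fixed_length_turn).to(device)
model.eval()
with torch.no_grad():
    for x, y in dataloader:
        out = model(x)
        print(f"Forward-pass output shape: {out.shape}")
        break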