def evaluate(self, iterator):
    self.model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            label = batch["label"]
            text = batch["text"]
            input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, text)
            input_ids = seq_padding(self.tokenizer, input_ids)
            token_type_ids = seq_padding(self.tokenizer, token_type_ids)
            # labels have shape (batch_size, 1)
            label = label.unsqueeze(1)
            # LongTensor is required
            input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
            # move to the GPU
            input_ids, token_type_ids, label = (input_ids.to(self.device),
                                                token_type_ids.to(self.device),
                                                label.to(self.device))
            output = self.model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                labels=label)
            y_pred_label = output[1].argmax(dim=1)
            loss = output[0]
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            epoch_loss += loss.item()
            epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)
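# evaluate() (and the other classifier snippets below) call a project helper
# seq_padding(tokenizer, X) that is not shown here. This is only a minimal sketch of
# the assumed behavior -- pad every id list in the batch with the tokenizer's [PAD]
# id up to the batch's longest sequence and return a tensor; the real helper in the
# original project may differ.
import torch

def seq_padding(tokenizer, X):
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')   # id used to fill short sequences
    max_len = max(len(x) for x in X)                     # pad to the longest sequence in the batch
    padded = [list(x) + [pad_id] * (max_len - len(x)) for x in X]
    return torch.tensor(padded)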
def get_embedding(self, text1):
    x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
    x1mask = [1] * len(x1token)
    x1mask = FloatTensor(utils.seq_padding(np.array([x1mask]), self.para.bert_maxlen))
    x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
    x1ids = LongTensor(utils.seq_padding(np.array([x1ids]), self.para.bert_maxlen))
    x1 = self.my_model.bert_embedding([x1ids, x1mask])
    # L2-normalize the sentence embedding
    x1 = x1 / torch.sqrt(torch.sum(x1 * x1, -1, keepdim=True))
    print('x1 shape: ', np.shape(x1[0]))
    return x1[0].cpu().detach().numpy()
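# Hypothetical usage sketch (the sentences and the `evaluator` instance are
# illustrative): because get_embedding returns an L2-normalized vector, assuming
# bert_embedding produces one pooled vector per sentence, cosine similarity between
# two sentences reduces to a plain dot product.
import numpy as np

emb_a = evaluator.get_embedding('今天天气不错')
emb_b = evaluator.get_embedding('今天天气很好')
print('cosine similarity:', float(np.dot(emb_a, emb_b)))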
def get_embedding_list(self, text1):
    x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
    x1mask = [1] * len(x1token)
    x1mask = FloatTensor(utils.seq_padding(np.array([x1mask]), self.para.bert_maxlen))
    x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
    x1ids = LongTensor(utils.seq_padding(np.array([x1ids]), self.para.bert_maxlen))
    x1 = self.my_model.bert_embedding.bert_embedding_model(x1ids, attention_mask=x1mask)
    # take the last two layers and concatenate them along the feature dimension
    x1 = x1[0][-2:]
    x1 = torch.cat(x1, -1)
    x1mask = x1mask.view(-1, self.para.bert_maxlen, 1)
    x1mask = x1mask.expand(-1, -1, 2048)
    x1 = x1 * x1mask  # zero out the vectors at masked (padding) positions
    return x1[0, :len(x1token), :].cpu().detach().numpy()
def predict(self, sentence):
    input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, sentence)
    input_ids = seq_padding(self.tokenizer, [input_ids])
    token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
    # LongTensor is required
    input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
    # zero the gradients
    self.optimizer.zero_grad()
    # move to the GPU
    input_ids, token_type_ids = input_ids.to(self.device), token_type_ids.to(self.device)
    output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
    y_pred_prob = output[0]
    y_pred_label = y_pred_prob.argmax(dim=1)
    print(y_pred_label)
def get_batch_embedding(self, text_batch):
    X1ids = []
    X1mask = []
    for text1 in text_batch:
        x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
        x1mask = [1] * len(x1token)
        x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
        X1ids.append(x1ids)
        X1mask.append(x1mask)
    X1ids = LongTensor(utils.seq_padding(np.array(X1ids), self.para.bert_maxlen))
    X1mask = FloatTensor(utils.seq_padding(np.array(X1mask), self.para.bert_maxlen))
    X1 = self.my_model.bert_embedding([X1ids, X1mask])
    X1 = X1 / torch.sqrt(torch.sum(X1 * X1, -1, keepdim=True))
    # print('X1 shape: ', np.shape(X1))
    return X1
def splitBatch(self, en, cn, batch_size, shuffle=True):
    idx_list = np.arange(0, len(en), batch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    batch_indexs = []
    for idx in idx_list:
        batch_indexs.append(np.arange(idx, min(idx + batch_size, len(en))))
    batches = []
    for batch_index in batch_indexs:
        batch_en = [en[index] for index in batch_index]
        batch_cn = [cn[index] for index in batch_index]
        batch_cn = seq_padding(batch_cn)
        batch_en = seq_padding(batch_en)
        batches.append(Batch(batch_en, batch_cn))
    return batches
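# splitBatch relies on a one-argument seq_padding(X) helper, a different variant from
# the tokenizer-aware one sketched earlier. A minimal sketch of the assumed behavior:
# pad every id list in the batch with 0 up to the batch's longest sentence and return
# a numpy array (the project's real helper may differ).
import numpy as np

def seq_padding(X, padding=0):
    lengths = [len(x) for x in X]
    max_len = max(lengths)
    return np.array([
        np.concatenate([x, [padding] * (max_len - len(x))]) if len(x) < max_len else x
        for x in X
    ])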
def train_an_epoch(self, iterator):
    self.model_setup()
    epoch_loss = 0
    epoch_acc = 0
    for i, batch in enumerate(iterator):
        label = batch["label"]
        text = batch["text"]
        input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, text)
        input_ids = seq_padding(self.tokenizer, input_ids)
        token_type_ids = seq_padding(self.tokenizer, token_type_ids)
        # labels have shape (batch_size, 1)
        label = label.unsqueeze(1)
        # LongTensor is required
        input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
        # zero the gradients
        self.optimizer.zero_grad()
        # move to the GPU
        input_ids, token_type_ids, label = (input_ids.to(self.device),
                                            token_type_ids.to(self.device),
                                            label.to(self.device))
        output = self.model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            labels=label)
        y_pred_prob = output[1]
        y_pred_label = y_pred_prob.argmax(dim=1)
        # compute the loss; this loss is the same as output[0]
        loss = self.criterion(y_pred_prob.view(-1, 2), label.view(-1))
        # loss = output[0]
        # compute the accuracy
        acc = ((y_pred_label == label.view(-1)).sum()).item()
        # backpropagation
        loss.backward()
        self.optimizer.step()
        # accumulate loss and acc over the epoch
        epoch_loss += loss.item()
        epoch_acc += acc
        if i % 200 == 0:
            print("current loss:", epoch_loss / (i + 1), "\t",
                  "current acc:", epoch_acc / ((i + 1) * len(label)))
    return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)
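# convert_text_to_ids is another project helper that is not shown. This is only a
# plausible sketch, assuming it wraps a Hugging Face tokenizer: for a single string
# it returns flat input_ids / token_type_ids lists (matching the predict snippets),
# and for a list of texts it returns parallel lists of lists (matching the training
# loop). The max_len default is an illustrative assumption.
def convert_text_to_ids(tokenizer, text, max_len=100):
    if isinstance(text, str):
        encoded = tokenizer.encode_plus(text, max_length=max_len, truncation=True)
        return encoded['input_ids'], encoded['token_type_ids']
    input_ids, token_type_ids = [], []
    for t in text:
        ids, types = convert_text_to_ids(tokenizer, t, max_len)
        input_ids.append(ids)
        token_type_ids.append(types)
    return input_ids, token_type_ids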
def __get_testbatch__(self, num):
    start_idx = num * self.batch_size
    end_idx = min(start_idx + self.batch_size, len(self.data))
    idxs = [start_idx + i for i in range(end_idx - start_idx)]
    X1, X2, Y = [], [], []
    for i in idxs:
        d = self.data[i]
        text = d[0][:self.pad_size]
        x1, x2 = tokenizer.encode(first=text)
        X1.append(x1)
        X2.append(x2)
        label = [0] * 202
        for p in d[1]:
            label[self.tag2id[p]] = 1
        Y.append(label)
    # print(X1)
    X1 = np.array(seq_padding(X1))
    X2 = np.array(seq_padding(X2))
    Y = np.array(seq_padding(Y))
    return [X1, X2], Y
def splitBatch(self, en, cn, batch_size, shuffle=True):
    """
    Split the source (English) and target (Chinese) data, both represented as
    lists of word ids, into batches of the given batch_size.
    If shuffle is True, the order of the batches is randomly shuffled.
    Because the data is sorted by length, sentences within a batch are of
    similar length, so fewer positions need padding.
    """
    # From the index list [0, 1, ..., len(en)-1] over the length-sorted data,
    # take one index every batch_size positions as the start index of each batch
    idx_list = np.arange(0, len(en), batch_size)
    # If shuffle is True, shuffle these batch start indices
    if shuffle:
        np.random.shuffle(idx_list)
    # Holds the sentence indices belonging to each batch
    batch_indexs = []
    for idx in idx_list:
        # Note: the batch with the largest start index may run past the end of
        # the data, so its end index must be capped at the data size
        """
        e.g. [array([4, 5, 6, 7]), array([0, 1, 2, 3]), array([8, 9, 10, 11]), ...]
        """
        batch_indexs.append(np.arange(idx, min(idx + batch_size, len(en))))
    # Use the per-batch sentence indices to build the actual batches of word-id sentences
    batches = []
    for batch_index in batch_indexs:
        # Extract the word-id sentences of the current batch by their indices (fancy indexing)
        batch_en = [en[index] for index in batch_index]
        batch_cn = [cn[index] for index in batch_index]
        # Pad every sentence in the current batch to the same length;
        # resulting shape: num_batches x batch_size x max sentence length per batch
        batch_cn = seq_padding(batch_cn)
        batch_en = seq_padding(batch_en)
        # Append the English and Chinese data of the current batch to the list of all batches
        batches.append(Batch(batch_en, batch_cn))
    return batches
def __iter__(self):
    train_data = self.data
    while True:
        idxs = [i for i in range(len(train_data))]
        np.random.shuffle(idxs)
        X1, X2, Y = [], [], []
        for i in idxs:
            # print(i)
            d = train_data[i]
            text = d[0][:self.pad_size]
            x1, x2 = tokenizer.encode(first=text)
            X1.append(x1)
            X2.append(x2)
            label = [0] * 202
            for p in d[1]:
                label[self.tag2id[p]] = 1
            Y.append(label)
            if len(X1) == self.batch_size or i == idxs[-1]:
                X1 = np.array(seq_padding(X1))
                X2 = np.array(seq_padding(X2))
                Y = np.array(seq_padding(Y))
                yield [X1, X2], Y
                X1, X2, Y = [], [], []
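# Hypothetical usage sketch for the generator above, assuming a Keras model that
# takes [token_ids, segment_ids] as input. The class name data_generator, the
# batch size, steps_per_epoch, and epochs are all illustrative assumptions.
train_gen = data_generator(train_data, batch_size=32)
model.fit_generator(
    iter(train_gen),                        # __iter__ yields ([X1, X2], Y) batches indefinitely
    steps_per_epoch=len(train_data) // 32,
    epochs=5,
)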
def predict(self, sentence):
    self.model.setup()
    self.model.eval()
    # tokenize, then pad
    input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, sentence)
    input_ids = seq_padding(self.tokenizer, [input_ids])
    token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
    # LongTensor is required
    input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
    # zero the gradients
    self.optimizer.zero_grad()
    # move to the GPU
    input_ids, token_type_ids = input_ids.to(self.device), token_type_ids.to(self.device)
    output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
    # y_pred_prob: the score for each class
    y_pred_prob = output[0]
    # take the label with the highest score
    y_pred_label = y_pred_prob.argmax(dim=1)
    # convert the torch.tensor back to a plain int
    return y_pred_label.item()
def get_sim(self, text1, text2):
    x1token = ['[CLS]'] + tokenizer.tokenize(text1) + ['[SEP]']
    x2token = ['[CLS]'] + tokenizer.tokenize(text2) + ['[SEP]']
    x1mask = [1] * len(x1token)
    x2mask = [1] * len(x2token)
    x1mask = FloatTensor(utils.seq_padding(np.array([x1mask]), para.bert_maxlen))
    x2mask = FloatTensor(utils.seq_padding(np.array([x2mask]), para.bert_maxlen))
    x1ids = tokenizer.convert_tokens_to_ids(x1token)
    x2ids = tokenizer.convert_tokens_to_ids(x2token)
    x1ids = LongTensor(utils.seq_padding(np.array([x1ids]), para.bert_maxlen))
    x2ids = LongTensor(utils.seq_padding(np.array([x2ids]), para.bert_maxlen))
    x1 = self.my_model.bert_embedding([x1ids, x1mask])
    x2 = self.my_model.bert_embedding([x2ids, x2mask])
    x1 = x1 / torch.sqrt(torch.sum(x1 * x1, -1, keepdim=True))
    x2 = x2 / torch.sqrt(torch.sum(x2 * x2, -1, keepdim=True))
    x1x2 = torch.sum(x1 * x2, -1)
    return x1x2[0].item()
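# Usage sketch (illustrative; `evaluator` stands in for whatever instance defines
# get_sim): because both embeddings are L2-normalized before the dot product,
# get_sim returns a cosine similarity in [-1, 1].
sim = evaluator.get_sim('苹果手机多少钱', '二手苹果手机价格')
print('similarity:', sim)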
def get_batch(self):
    files = self.files
    file_type = self.file_type
    batch_size = self.batch_size
    product_id_list = []
    boxes_list = []
    images_features_list = []
    idx_class_labels_list = []
    idx_class_labels_mask_list = []
    idx_query_list = []
    query_id_list = []
    label_list = []
    mask_query_list = []
    mask_idx_query_list = []
    mask_label_list = []
    epoch_num = 0.0
    while 1:
        random.shuffle(files)
        for filename in files:
            with open(os.path.join(KDD_DATA, filename), 'r', encoding='utf-8') as f:
                lines = f.readlines()
            index_list = [i for i in range(len(lines))]
            random.shuffle(index_list)
            for i, index in enumerate(index_list):
                try:
                    line = lines[index]
                    if "product_id" in line:
                        continue
                    product_id, boxes, images_features, idx_class_labels, idx_class_labels_mask, \
                        idx_query, query_id, query, class_label, mask_query, mask_idx_query, mask_label = \
                        read_line(line, self.dict_multimodal_labels, self.tokenizer)
                    label_list.append(1)
                    product_id_list.append(product_id)
                    boxes_list.append(boxes)
                    images_features_list.append(images_features)
                    idx_class_labels_list.append(idx_class_labels)
                    idx_class_labels_mask_list.append(idx_class_labels_mask)
                    idx_query_list.append(idx_query)
                    query_id_list.append(query_id)
                    mask_query_list.append(mask_query)
                    mask_idx_query_list.append(mask_idx_query)
                    mask_label_list.append(mask_label)
                    if len(product_id_list) == batch_size or i == (len(index_list) - 1):
                        np_boxes, _ = seq_padding_2(boxes_list, maxlen=MAX_BOX_NUM, padding_value=0)
                        np_images_features, np_images_features_mask = seq_padding_2(
                            images_features_list, maxlen=MAX_BOX_NUM, padding_value=0)
                        np_idx_class_labels, _ = seq_padding_2(
                            idx_class_labels_list, maxlen=MAX_BOX_NUM, padding_value=0)
                        np_idx_class_labels_mask, _ = seq_padding_2(
                            idx_class_labels_mask_list, maxlen=MAX_BOX_NUM, padding_value=0)
                        np_idx_query, np_idx_query_mask = seq_padding(
                            idx_query_list, maxlen=MAX_LENGTH, padding_value=0)
                        np_mask_idx_query, np_mask_idx_query_mask = seq_padding(
                            mask_idx_query_list, maxlen=MAX_LENGTH, padding_value=0)
                        np_mask_label, _ = seq_padding(
                            mask_label_list, maxlen=MAX_LENGTH, padding_value=-1)
                        yield product_id_list, np_boxes, np_images_features, np_images_features_mask, \
                            np_idx_class_labels, np_idx_class_labels_mask, \
                            query_id_list, np_idx_query, np_idx_query_mask, \
                            np_mask_idx_query, np_mask_idx_query_mask, np_mask_label, \
                            np.array(label_list)
                        product_id_list = []
                        boxes_list = []
                        images_features_list = []
                        idx_class_labels_list = []
                        idx_class_labels_mask_list = []
                        idx_query_list = []
                        query_id_list = []
                        label_list = []
                        mask_query_list = []
                        mask_idx_query_list = []
                        mask_label_list = []
                except Exception as e:
                    import traceback
                    traceback.print_exc()
                    continue
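# get_batch uses yet another seq_padding variant that takes an explicit maxlen and
# padding_value and returns both the padded array and its mask (seq_padding_2 appears
# to be the counterpart for 2-D per-box features). This is only a minimal sketch under
# those assumptions; the real helpers are defined elsewhere in the project.
import numpy as np

def seq_padding(X, maxlen, padding_value=0):
    padded, mask = [], []
    for x in X:
        x = list(x)[:maxlen]                        # truncate to maxlen
        pad_len = maxlen - len(x)
        padded.append(x + [padding_value] * pad_len)
        mask.append([1] * len(x) + [0] * pad_len)   # 1 for real positions, 0 for padding
    return np.array(padded), np.array(mask)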
print(evaluator.get_embedding(text1))

for epoch in range(para.epoch):
    loss_list = []
    for step, data in tqdm(enumerate(train_loader)):
        X1, X2 = data
        X1ids, X2ids, X1mask, X2mask = [], [], [], []
        for i in range(len(X1)):
            x1token = ['[CLS]'] + tokenizer.tokenize(X1[i]) + ['[SEP]']
            x2token = ['[CLS]'] + tokenizer.tokenize(X2[i]) + ['[SEP]']
            X1ids.append(tokenizer.convert_tokens_to_ids(x1token))
            X2ids.append(tokenizer.convert_tokens_to_ids(x2token))
            X1mask.append([1] * len(x1token))
            X2mask.append([1] * len(x2token))
        X1ids = LongTensor(utils.seq_padding(np.array(X1ids), para.bert_maxlen))
        X2ids = LongTensor(utils.seq_padding(np.array(X2ids), para.bert_maxlen))
        X1mask = FloatTensor(utils.seq_padding(np.array(X1mask), para.bert_maxlen))
        X2mask = FloatTensor(utils.seq_padding(np.array(X2mask), para.bert_maxlen))
        # when using DataParallel, the next two lines need to go through .module()
        X1embed = model.bert_embedding([X1ids, X1mask])
        X2embed = model.bert_embedding([X2ids, X2mask])
        T1embed, T2embed = \
            utils.get_pairs([X1embed.cpu().data.numpy(), X2embed.cpu().data.numpy()])
        T1embed = FloatTensor(T1embed)