def val(test_loader):
    """Run the classifier over ``test_loader`` and return top-1 accuracy.

    Relies on module-level globals: ``image`` (input buffer tensor),
    ``MODEL`` (the network) and ``utils.loadData``.
    """
    test_iter = iter(test_loader)
    # Bug fix: original read len(data_loader) — an undefined/unrelated global.
    max_iter = len(test_loader)
    n_correct = 0
    n_total = 0
    for _ in range(max_iter):
        data = next(test_iter)  # bug fix: Py2-only iterator.next()
        cpu_images = data[0]
        cpu_labels = data[1]
        [num] = cpu_labels.shape
        utils.loadData(image, cpu_images)
        preds = MODEL(image)
        # Bug fix: .cpu is a method — original `.cpu.numpy()` raised AttributeError.
        arg_max = preds.argmax(1).cpu().numpy()
        labels = cpu_labels.numpy()
        n_correct += np.sum(arg_max == labels)
        n_total += num
    acc = n_correct / float(n_total)
    return acc
def demo(image_path):
    """Recognize the text in a single image file and print both reading
    directions; returns the left-to-right prediction.

    Uses module globals: ``transformer``, ``cuda_flag``, ``converter``,
    ``MORAN`` and ``utils``.
    """
    img = Image.open(image_path).convert('L')
    img = transformer(img)
    if cuda_flag:
        img = img.cuda()
    img = Variable(img.view(1, *img.size()))

    # Dummy target buffers sized for a single sample.
    text = Variable(torch.LongTensor(1 * 5))
    length = Variable(torch.IntTensor(1))
    max_iter = 20
    enc_t, enc_l = converter.encode('0' * max_iter)
    utils.loadData(text, enc_t)
    utils.loadData(length, enc_l)

    output = MORAN(img, length, text, text, test=True, debug=True)
    preds, preds_reverse = output[0]
    demo = output[1]  # intermediate (rectified) output from MORAN

    _, preds = preds.max(1)
    _, preds_reverse = preds_reverse.max(1)
    sim_preds = converter.decode(preds.data, length.data).strip().split('$')[0]
    sim_preds_reverse = converter.decode(preds_reverse.data, length.data).strip().split('$')[0]

    print(image_path)
    print('\nResult:\n' + 'Left to Right: ' + sim_preds + '\nRight to Left: ' + sim_preds_reverse)
    return sim_preds
def train_batch():
    """Fetch one batch from the global ``train_iter`` and copy it into the
    global ``image`` / ``ori_label`` buffers."""
    data = next(train_iter)  # bug fix: iterator.next() is Python 2 only
    cpu_images = data[0]
    cpu_labels = data[1]
    utils.loadData(image, cpu_images)
    utils.loadData(ori_label, cpu_labels)
def predict(self, img_batch):
    """Run the MORAN network on a batch of images.

    Returns ``(output, length)`` where ``output`` is the raw network output
    and ``length`` the per-sample decode length buffer.
    """
    batch_size = int(img_batch.size(0))
    if self.cuda_flag:
        img_batch = img_batch.cuda()

    # Target placeholders: 5 slots per sample, filled with dummy '0' text.
    text = torch.LongTensor(batch_size * 5)
    length = torch.IntTensor(batch_size)
    max_iter = 20
    enc_text, enc_len = self.converter.encode(['0' * max_iter] * batch_size)
    utils.loadData(text, enc_text)
    utils.loadData(length, enc_len)

    output = self.MORAN(img_batch, length, text, text, test=True, debug=True)
    return output, length
def trainBatch():
    """One training step over a combined batch drawn from two loaders.

    Returns the total loss tensor. Uses module globals: ``train_iter1``,
    ``train_iter2``, ``image``, ``text1_ori``, ``text2_ori``, ``length_ori``,
    ``MODEL``, ``criterion``, ``optimizer`` and the running loss averagers.
    """
    data1 = next(train_iter1)  # bug fix: Py2-only .next()
    data2 = next(train_iter2)
    cpu_images = torch.cat((data1[0], data2[0]), 0)
    cpu_texts1 = data1[1] + data2[1]
    cpu_texts2 = data1[3] + data2[3]
    utils.loadData(image, cpu_images)
    t1, l1 = converter.encode(cpu_texts1, scanned=True)
    utils.loadData(text1_ori, t1)
    utils.loadData(length_ori, l1)
    t2, l2 = converter.encode(cpu_texts2, scanned=True)
    utils.loadData(text2_ori, t2)
    # NOTE(review): l2 is computed but never loaded — presumably the second
    # direction shares length_ori; confirm against the converter contract.
    if opt.LR:  # bug fix: `is True` identity check is fragile; truthiness suffices
        preds1, preds2 = MODEL(image, length_ori, text1_ori, text2_ori, cpu_texts=cpu_texts1)
        text1_new = text1_ori
        text2_new = text2_ori
        cost_pred1 = criterion(preds1, text1_new) / 2.0
        cost_pred2 = criterion(preds2, text2_new) / 2.0
        loss_pred_avg1.add(cost_pred1)
        loss_pred_avg2.add(cost_pred2)
        cost = cost_pred1 + cost_pred2
    else:
        preds1 = MODEL(image, length_ori, text1_ori, None, cpu_texts=cpu_texts1)
        text1_new = text1_ori
        cost_pred1 = criterion(preds1, text1_new)
        loss_pred_avg1.add(cost_pred1)
        cost = cost_pred1
    loss_avg.add(cost)
    MODEL.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def train_batch():
    """One supervised training step: load a batch into the global buffers,
    forward through ``MODEL``, backprop and update the optimizer."""
    data = next(train_iter)  # bug fix: Py2-only iterator.next()
    cpu_images = data[0]
    cpu_labels = data[1]
    utils.loadData(image, cpu_images)
    utils.loadData(ori_label, cpu_labels)
    preds = MODEL(image)
    cost = criterion(preds, ori_label)
    loss.add(cost)  # running loss averager
    MODEL.zero_grad()
    cost.backward()
    optimizer.step()
def trainBatch():
    """One bidirectional training step for MORAN; returns the loss tensor."""
    data = next(train_iter)  # bug fix: Py2-only iterator.next()
    cpu_images, cpu_texts, cpu_texts_rev = data
    utils.loadData(image, cpu_images)
    t, l = converter.encode(cpu_texts, scanned=True)
    t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)
    utils.loadData(text, t)
    utils.loadData(text_rev, t_rev)
    utils.loadData(length, l)
    preds0, preds1 = MORAN(image, length, text, text_rev)
    # Both reading directions are scored against their own targets jointly.
    cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
    MORAN.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def train(net, criterion, optimizer, data):
    """Run one CTC training step on ``data`` and return the per-sample loss.

    ``data`` is a ``(images, texts)`` pair; the global ``image`` / ``text`` /
    ``length`` buffers are filled via ``utils.loadData`` before the forward.
    """
    cpu_images, cpu_texts = data
    batch_size = cpu_images.size(0)
    utils.loadData(image, cpu_images)

    encoded, lengths = converter.encode(cpu_texts)  # map strings to class ids
    utils.loadData(text, encoded)
    utils.loadData(length, lengths)

    optimizer.zero_grad()
    preds = net(image)
    # CTC needs the (constant) prediction length per sample.
    preds_size = Variable(torch.LongTensor([preds.size(0)] * batch_size))
    cost = criterion(preds, text, preds_size, length) / batch_size
    cost.backward()
    optimizer.step()
    return cost
def val(net, criterion, eval_data_batch):
    """Validate ``net`` on ``eval_data_batch``: prints per-sample decodes and
    the final loss/accuracy.

    Bug fixes: the original froze/ran the global ``crnn`` while ignoring the
    ``net`` argument, reported ``loss_avg.val()`` although losses were
    accumulated in ``loss_avg_eval``, and divided by the out-of-scope
    ``len(eval_dataset)`` instead of the number of samples actually seen.
    """
    print('Start val')
    for p in net.parameters():  # bug fix: was crnn.parameters()
        p.requires_grad = False
    net.eval()
    n_correct = 0
    n_total = 0
    loss_avg_eval = utils.averager()
    for data in eval_data_batch:
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)
        preds = net(image)  # bug fix: was crnn(image)
        preds_size = Variable(torch.LongTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg_eval.add(cost)
        # Greedy CTC decode: argmax over classes, then collapse via converter.
        _, preds = preds.max(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        cpu_texts_decode = list(cpu_texts)
        for pred, target in zip(sim_preds, cpu_texts_decode):
            n_total += 1
            if pred == target:
                n_correct += 1
        raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:config.n_val_disp]
        for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts_decode):
            print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    accuracy = n_correct / float(n_total) if n_total else 0.0
    print('Val loss: %f, accuray: %f' % (loss_avg_eval.val(), accuracy))
def val(dataset, criterion, max_iter=10000, steps=0):
    """Validate the bidirectional MORAN model on ``dataset``.

    Picks, per sample, the reading direction whose mean character probability
    is higher; logs loss, accuracy and normalized Levenshtein distance, and
    returns the accuracy.
    """
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=args.batchSize, num_workers=args.workers)
    val_iter = iter(data_loader)
    max_iter = min(max_iter, len(data_loader))
    n_correct = 0
    n_total = 0
    distance = 0.0
    loss_avg = utils.averager()
    # Bug fix: the log file was opened without a context manager and leaked
    # if an exception interrupted validation.
    with open('logger/log.txt', 'w', encoding='utf-8') as f:
        for i in range(max_iter):
            data = next(val_iter)  # bug fix: Py2-only iterator.next()
            cpu_images, cpu_texts, cpu_texts_rev = data
            utils.loadData(image, cpu_images)
            t, l = converter.encode(cpu_texts, scanned=True)
            t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)
            utils.loadData(text, t)
            utils.loadData(text_rev, t_rev)
            utils.loadData(length, l)
            preds0, _, preds1, _ = MORAN(image, length, text, text_rev,
                                         debug=False, test=True, steps=steps)
            cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
            # Per-character max probability and class for each direction.
            preds0_prob, preds0 = preds0.max(1)
            preds0 = preds0.view(-1)
            preds0_prob = preds0_prob.view(-1)
            sim_preds0 = converter.decode(preds0.data, length.data)
            preds1_prob, preds1 = preds1.max(1)
            preds1 = preds1.view(-1)
            preds1_prob = preds1_prob.view(-1)
            sim_preds1 = converter.decode(preds1.data, length.data)
            sim_preds = []
            for j in range(cpu_images.size(0)):
                text_begin = 0 if j == 0 else length.data[:j].sum()
                # Choose the direction with the higher mean confidence up to '$' (EOS).
                if torch.mean(preds0_prob[text_begin:text_begin + len(sim_preds0[j].split('$')[0] + '$')]).item() > \
                        torch.mean(preds1_prob[text_begin:text_begin + len(sim_preds1[j].split('$')[0] + '$')]).item():
                    sim_preds.append(sim_preds0[j].split('$')[0] + '$')
                else:
                    # Reversed-direction prediction is flipped back before scoring.
                    sim_preds.append(sim_preds1[j].split('$')[0][-1::-1] + '$')
            loss_avg.add(cost)
            for pred, target in zip(sim_preds, cpu_texts):
                if pred == target.lower():
                    n_correct += 1
                f.write("pred %s\t\t\t\t\ttarget %s\n" % (pred, target))
                distance += levenshtein(pred, target) / max(len(pred), len(target))
                n_total += 1
    accuracy = n_correct / float(n_total)
    log.scalar_summary('Validation/levenshtein distance', distance / n_total, steps)
    log.scalar_summary('Validation/loss', loss_avg.val(), steps)
    log.scalar_summary('Validation/accuracy', accuracy, steps)
    return accuracy
def val_beam(dataset, max_iter=9999):
    """Validate with rotation-augmented beam voting.

    Runs the model on the original images and (optionally) on left/right
    90-degree-rotated copies, then for each sample keeps the candidate with
    the highest score before comparing against the ground truth filtered
    through the test alphabet. Returns accuracy.

    Uses module globals: ``converter``, ``utils``, ``MODEL``, ``opt``,
    ``image``, ``text1_ori``, ``text2_ori``, ``length_ori``, ``toPIL``,
    ``toTensor``.
    """
    rotate90 = dataset.ifRotate90
    data_loader = torch.utils.data.DataLoader(dataset, shuffle=False,
                                              batch_size=opt.batchSize,
                                              num_workers=1)  # opt.batchSize
    val_iter = iter(data_loader)
    max_iter = min(max_iter, len(data_loader))
    n_correct = 0
    n_total = 0
    for i in range(max_iter):
        data = val_iter.next()
        ori_cpu_images = data[0]
        flag_rotate90 = data[2]   # per-sample flag: apply the 90-degree variants
        cpu_texts1 = data[1]      # forward-direction ground truth
        cpu_texts2 = data[3]      # second-direction ground truth (LR mode)
        t1, l1 = converter.encode(cpu_texts1, scanned=True)
        t2, l2 = converter.encode(cpu_texts2, scanned=True)
        utils.loadData(text1_ori, t1)
        utils.loadData(text2_ori, t2)
        utils.loadData(length_ori, l1)
        # Candidate pools: one entry per image variant (original/left/right).
        All_preds_add5EOS1 = []
        All_scores1 = []
        All_preds_add5EOS2 = []
        All_scores2 = []
        cpu_images = ori_cpu_images
        utils.loadData(image, cpu_images)
        if opt.LR:
            # NOTE(review): this call unpacks 4 values while the rotated-image
            # calls below unpack 5 — confirm MODEL's test-mode return arity.
            local_preds1, local_scores1, local_preds2, local_scores2 = MODEL(
                image, length_ori, text1_ori, text2_ori, test=True,
                cpu_texts=cpu_texts1)
            All_preds_add5EOS1.append(local_preds1)
            All_preds_add5EOS2.append(local_preds2)
            All_scores1.append(local_scores1)
            All_scores2.append(local_scores2)
        else:
            local_preds1, local_scores1 = MODEL(image, length_ori, text1_ori,
                                                None, test=True,
                                                cpu_texts=cpu_texts1)
            All_preds_add5EOS1.append(local_preds1)
            All_scores1.append(local_scores1)
        length_label = (length_ori - 1).data.cpu().numpy()
        # %%% Left/Right Rotate %%%
        if rotate90 == True:
            # Undo normalization (div(2).sub(-0.5)) so images convert to PIL,
            # rotate flagged samples both ways, then re-normalize below.
            PIL_imgs = [
                toPIL(ori_cpu_images[i].div(2).sub(-0.5))
                for i in range(ori_cpu_images.shape[0])
            ]
            PIL_imgs_left90 = [
                PIL_imgs[i].transpose(Image.ROTATE_90).resize(
                    (opt.imgW, opt.imgH), Image.BILINEAR)
                if flag_rotate90[i] else PIL_imgs[i]
                for i in range(ori_cpu_images.shape[0])
            ]
            PIL_imgs_right90 = [
                PIL_imgs[i].transpose(Image.ROTATE_270).resize(
                    (opt.imgW, opt.imgH), Image.BILINEAR)
                if flag_rotate90[i] else PIL_imgs[i]
                for i in range(ori_cpu_images.shape[0])
            ]
            imgs_Tensor_left90 = [
                toTensor(PIL_imgs_left90[i])
                for i in range(ori_cpu_images.shape[0])
            ]
            imgs_Tensor_right90 = [
                toTensor(PIL_imgs_right90[i])
                for i in range(ori_cpu_images.shape[0])
            ]
            # Left
            cpu_images = torch.stack(imgs_Tensor_left90)
            cpu_images.sub_(0.5).div_(0.5)  # re-normalize to [-1, 1]
            utils.loadData(image, cpu_images)
            if opt.LR:
                local_preds1, local_scores1, local_preds2, local_scores2, _ = MODEL(
                    image, length_ori, text1_ori, text2_ori, test=True,
                    cpu_texts=cpu_texts1)
                All_preds_add5EOS1.append(local_preds1)
                All_preds_add5EOS2.append(local_preds2)
                All_scores1.append(local_scores1)
                All_scores2.append(local_scores2)
            else:
                local_preds1, local_scores1, _ = MODEL(image, length_ori,
                                                       text1_ori, None,
                                                       test=True,
                                                       cpu_texts=cpu_texts1)
                All_preds_add5EOS1.append(local_preds1)
                All_scores1.append(local_scores1)
            # Right
            cpu_images = torch.stack(imgs_Tensor_right90)
            cpu_images.sub_(0.5).div_(0.5)
            utils.loadData(image, cpu_images)
            if opt.LR:
                local_preds1, local_scores1, local_preds2, local_scores2, _ = MODEL(
                    image, length_ori, text1_ori, text2_ori, test=True,
                    cpu_texts=cpu_texts1)
                All_preds_add5EOS1.append(local_preds1)
                All_preds_add5EOS2.append(local_preds2)
                All_scores1.append(local_scores1)
                All_scores2.append(local_scores2)
            else:
                local_preds1, local_scores1, _ = MODEL(image, length_ori,
                                                       text1_ori, None,
                                                       test=True,
                                                       cpu_texts=cpu_texts1)
                All_preds_add5EOS1.append(local_preds1)
                All_scores1.append(local_scores1)
        # Start to decode
        # For each sample pick the variant (original/left/right) with the
        # highest score; predictions are stored flat with 5 extra EOS slots
        # per sample, hence the `+ j * 5` offset arithmetic.
        preds_add5EOS1 = []
        preds_score1 = []
        for j in range(cpu_images.size(0)):
            text_begin = 0 if j == 0 else (length_ori.data[:j].sum() + j * 5)
            max_score = -99999
            max_index = 0
            for index in range(len(All_scores1)):
                local_score = All_scores1[index][j]
                if local_score > max_score:
                    max_score = local_score
                    max_index = index
            preds_add5EOS1.extend(
                All_preds_add5EOS1[max_index][text_begin:text_begin +
                                              int(length_ori[j].data) + 5])
            preds_score1.append(max_score)
        preds_add5EOS1 = torch.stack(preds_add5EOS1)
        sim_preds_add5eos1 = converter.decode(preds_add5EOS1.data,
                                              length_ori.data + 5)
        if opt.LR:
            # Same best-variant selection for the second direction.
            preds_add5EOS2 = []
            preds_score2 = []
            for j in range(cpu_images.size(0)):
                text_begin = 0 if j == 0 else (length_ori.data[:j].sum() + j * 5)
                max_score = -99999
                max_index = 0
                for index in range(len(All_scores2)):
                    local_score = All_scores2[index][j]
                    if local_score > max_score:
                        max_score = local_score
                        max_index = index
                preds_add5EOS2.extend(
                    All_preds_add5EOS2[max_index][text_begin:text_begin +
                                                  int(length_ori[j].data) + 5])
                preds_score2.append(max_score)
            preds_add5EOS2 = torch.stack(preds_add5EOS2)
            sim_preds_add5eos2 = converter.decode(preds_add5EOS2.data,
                                                  length_ori.data + 5)
        if opt.LR:
            batch_index = 0
            for pred1, target1, pred2, target2 in zip(sim_preds_add5eos1,
                                                      cpu_texts1,
                                                      sim_preds_add5eos2,
                                                      cpu_texts2):
                # Keep whichever direction scored higher for this sample.
                if preds_score1[batch_index] > preds_score2[batch_index]:
                    pred = pred1
                    target = target1
                else:
                    pred = pred2
                    target = target2
                pred = pred.split(opt.sep)[0] + opt.sep
                # Filter both strings to the test alphabet before comparing.
                test_alphabet = dataset.test_alphabet.split(opt.sep)
                pred = ''.join(
                    pred[i].lower() if pred[i].lower() in test_alphabet else ''
                    for i in range(len(pred)))
                target = ''.join(target[i].lower()
                                 if target[i].lower() in test_alphabet else ''
                                 for i in range(len(target)))
                if pred.lower() == target.lower():
                    n_correct += 1
                n_total += 1
                batch_index += 1
        else:
            for pred, target in zip(sim_preds_add5eos1, cpu_texts1):
                pred = pred.split(opt.sep)[0] + opt.sep
                test_alphabet = dataset.test_alphabet.split(opt.sep)
                pred = ''.join(
                    pred[i].lower() if pred[i].lower() in test_alphabet else ''
                    for i in range(len(pred)))
                target = ''.join(target[i].lower()
                                 if target[i].lower() in test_alphabet else ''
                                 for i in range(len(target)))
                if pred.lower() == target.lower():
                    n_correct += 1
                n_total += 1
    accuracy = n_correct / float(n_total)
    dataset_name = dataset.root.split('/')[-1]
    print(dataset_name + ' ACCURACY -----> %.1f%%, ' % (accuracy * 100.0))
    return accuracy
def val(dataset, criterion, max_iter=1000):
    """Validate MORAN on ``dataset``; prints loss/accuracy/edit distance and
    returns the accuracy.

    Fixes: Py2-only ``iterator.next()`` replaced with ``next()``, deprecated
    ``tensor.data[0]`` replaced with ``.item()`` (consistent with the other
    ``val`` in this file), and the log file is now closed via ``with``.
    """
    print('Start val')
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=opt.batchSize,
        num_workers=int(opt.workers))  # opt.batchSize
    val_iter = iter(data_loader)
    max_iter = min(max_iter, len(data_loader))
    n_correct = 0
    n_total = 0
    distance = 0.0
    loss_avg = utils.averager()
    with open('./log.txt', 'a', encoding='utf-8') as f:
        for i in range(max_iter):
            data = next(val_iter)
            if opt.BidirDecoder:
                cpu_images, cpu_texts, cpu_texts_rev = data
                utils.loadData(image, cpu_images)
                t, l = converter.encode(cpu_texts, scanned=True)
                t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)
                utils.loadData(text, t)
                utils.loadData(text_rev, t_rev)
                utils.loadData(length, l)
                preds0, preds1 = MORAN(image, length, text, text_rev, test=True)
                cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
                preds0_prob, preds0 = preds0.max(1)
                preds0 = preds0.view(-1)
                preds0_prob = preds0_prob.view(-1)
                sim_preds0 = converter.decode(preds0.data, length.data)
                preds1_prob, preds1 = preds1.max(1)
                preds1 = preds1.view(-1)
                preds1_prob = preds1_prob.view(-1)
                sim_preds1 = converter.decode(preds1.data, length.data)
                sim_preds = []
                for j in range(cpu_images.size(0)):
                    text_begin = 0 if j == 0 else length.data[:j].sum()
                    # Keep the direction with the higher mean confidence up to '$' (EOS).
                    if torch.mean(preds0_prob[text_begin:text_begin + len(sim_preds0[j].split('$')[0] + '$')]).item() > \
                            torch.mean(preds1_prob[text_begin:text_begin + len(sim_preds1[j].split('$')[0] + '$')]).item():
                        sim_preds.append(sim_preds0[j].split('$')[0] + '$')
                    else:
                        sim_preds.append(sim_preds1[j].split('$')[0][-1::-1] + '$')
            else:
                cpu_images, cpu_texts = data
                utils.loadData(image, cpu_images)
                t, l = converter.encode(cpu_texts, scanned=True)
                utils.loadData(text, t)
                utils.loadData(length, l)
                # NOTE(review): text_rev is a module global here and may hold a
                # stale value in the unidirectional branch — confirm intent.
                preds = MORAN(image, length, text, text_rev, test=True)
                cost = criterion(preds, text)
                _, preds = preds.max(1)
                preds = preds.view(-1)
                sim_preds = converter.decode(preds.data, length.data)
            loss_avg.add(cost)
            for pred, target in zip(sim_preds, cpu_texts):
                if pred == target.lower():
                    n_correct += 1
                f.write("预测 %s 目标 %s\n" % (pred, target))
                distance += levenshtein(pred, target) / max(len(pred), len(target))
                n_total += 1
    print("correct / total: %d / %d, " % (n_correct, n_total))
    print('levenshtein distance: %f' % (distance / n_total))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
    return accuracy
# Fragment of a single-image demo: the enclosing definition (and the
# target_width/height/width variables) are outside this view.
# Resize keeping aspect ratio, normalize, and run MORAN once.
new_height = round(target_width * (height / width))
transformer = dataset.resizeNormalize((target_width, new_height))
image = transformer(image)
if cuda_flag:
    image = image.cuda()
image = image.view(1, *image.size())  # add batch dimension
image = Variable(image)
# Dummy target buffers for a single sample (5 slots).
text = torch.LongTensor(1 * 5)
length = torch.IntTensor(1)
text = Variable(text)
length = Variable(length)
max_iter = 20
t, l = converter.encode('0' * max_iter)
utils.loadData(text, t)
utils.loadData(length, l)
output = MORAN(image, length, text, text, test=True, debug=True)
preds, preds_reverse = output[0]  # both reading directions
demo = output[1]                  # intermediate (debug) output
_, preds = preds.max(1)
_, preds_reverse = preds_reverse.max(1)
sim_preds = converter.decode(preds.data, length.data)
sim_preds = sim_preds.strip().split('$')[0]  # truncate at EOS marker '$'
# sim_preds_reverse = converter.decode(preds_reverse.data, length.data)
# sim_preds_reverse = sim_preds_reverse.strip().split('$')[0]
# print('\nResult:\n' + 'Left to Right: ' + sim_preds +
def trainBatch(steps):
    """One MORAN training step; returns the loss tensor.

    ``steps`` is accepted for interface compatibility with callers but is
    not used inside this function.
    """
    data = next(train_iter)  # bug fix: Py2-only iterator.next()
    if opt.BidirDecoder:
        cpu_images, cpu_texts, cpu_texts_rev = data
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts, scanned=True)
        t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)
        utils.loadData(text, t)
        utils.loadData(text_rev, t_rev)
        utils.loadData(length, l)
        preds0, preds1 = MORAN(image, length, text, text_rev)
        cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
    else:
        cpu_images, cpu_texts = data
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts, scanned=True)
        utils.loadData(text, t)
        utils.loadData(length, l)
        # NOTE(review): text_rev is a module global that is never loaded in
        # this branch and may be stale — confirm intent.
        preds = MORAN(image, length, text, text_rev)
        cost = criterion(preds, text)
    MORAN.zero_grad()
    cost.backward()   # backpropagation
    optimizer.step()  # parameter update
    return cost
transformer = dataset.resizeNormalize((100, 32)) image = Image.open(img_path).convert('L') image = transformer(image) #读取灰度图像并将其转换成100*32(w,h), image:1x32x100 if cuda_flag: image = image.cuda() image = image.view(1, *image.size()) # 1x1x32x100 image = Variable(image) text = torch.LongTensor(1 * 5) length = torch.IntTensor(1) text = Variable(text) length = Variable(length) max_iter = 20 t, l = converter.encode('0' * max_iter) # 初始化文本内容和文本长度t=20*'0', l=20 utils.loadData(text, t) #将初始化的值赋值到text和l上 utils.loadData(length, l) ################# 3-模型输出 ####################################### output = MORAN(image, length, text, text, test=True, debug=True) #这里初始双向的结果 preds, preds_reverse = output[0] #双向结果 demo = output[1] #test debug阶段输出矫正的文本 _, preds = preds.max(1) _, preds_reverse = preds_reverse.max(1) sim_preds = converter.decode(preds.data, length.data) #将预测的文本概率转换成文本, jewelers$e$e$e$ sim_preds = sim_preds.strip().split('$')[0] #jewelers sim_preds_reverse = converter.decode(preds_reverse.data,
def predict(self, msg, img, rot=0):
    """Recognize text inside each bounding box of ``msg.text_array`` and
    draw/annotate matches onto ``img`` while filling a label ``mask``.

    Returns ``(img, mask)``.

    Bug fix: the Python 2 ``print "..."`` statement (a SyntaxError under
    Python 3) is replaced with the ``print(...)`` function used everywhere
    else in this file.
    """
    # Preprocessing
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    (rows, cols, channels) = img.shape
    mask = np.zeros([rows, cols], dtype=np.uint8)
    for text_bb in msg.text_array:
        # Skip boxes smaller than the configured area threshold.
        if (text_bb.box.ymax - text_bb.box.ymin) * (text_bb.box.xmax - text_bb.box.xmin) < self.bbox_thres:
            continue
        start = time.time()
        image = gray[text_bb.box.ymin:text_bb.box.ymax, text_bb.box.xmin:text_bb.box.xmax]
        image = Im.fromarray(image)
        image = self.transformer(image)
        if self.cuda_use:
            image = image.cuda()
        image = image.view(1, *image.size())
        image = Variable(image)
        # Dummy target buffers for a single sample.
        text = torch.LongTensor(1 * 5)
        length = torch.IntTensor(1)
        text = Variable(text)
        length = Variable(length)
        max_iter = 20
        t, l = self.converter.encode('0' * max_iter)
        utils.loadData(text, t)
        utils.loadData(length, l)
        output = self.network(image, length, text, text, test=True, debug=True)
        preds, preds_reverse = output[0]
        demo = output[1]
        _, preds = preds.max(1)
        _, preds_reverse = preds_reverse.max(1)
        sim_preds = self.converter.decode(preds.data, length.data)
        sim_preds = sim_preds.strip().split('$')[0]
        sim_preds_reverse = self.converter.decode(preds_reverse.data, length.data)
        sim_preds_reverse = sim_preds_reverse.strip().split('$')[0]
        print("Text Recognize Time : {}".format(time.time() - start))  # bug fix: was a Py2 print statement
        # Rebuild the detection contour as an int32 point array for fill ops.
        _cont = []
        for p in text_bb.contour:
            point = []
            point.append(p.point[0])
            point.append(p.point[1])
            _cont.append(point)
        _cont = np.array(_cont, np.int32)
        if sim_preds in self.commodity_list:
            # Exact match: draw box + label and fill the mask with the class pixel.
            cv2.rectangle(img, (text_bb.box.xmin, text_bb.box.ymin), (text_bb.box.xmax, text_bb.box.ymax), self.color_map[rot], 3)
            cv2.putText(img, sim_preds, (text_bb.box.xmin, text_bb.box.ymin), 0, 1, (0, 255, 255), 3)
            pix = self.commodity_list.index(sim_preds) + rot * len(self.commodity_list)
            # Offset the pixel value when this class already appears in the mask.
            if pix in np.unique(mask):
                cv2.fillConvexPoly(mask, _cont, pix + 4 * len(self.commodity_list))
            else:
                cv2.fillConvexPoly(mask, _cont, pix)
        else:
            # Fuzzy match via confidence-scored word correction.
            correct, conf, _bool = self.conf_of_word(sim_preds)
            if _bool:
                cv2.putText(img, correct + "{:.2f}".format(conf), (text_bb.box.xmin, text_bb.box.ymin), 0, 1, (0, 255, 255), 3)
                cv2.rectangle(img, (text_bb.box.xmin, text_bb.box.ymin), (text_bb.box.xmax, text_bb.box.ymax), (255, 255, 255), 2)
                pix = self.commodity_list.index(correct) + rot * len(self.commodity_list)
                if pix in np.unique(mask):
                    cv2.fillConvexPoly(mask, _cont, pix + 4 * len(self.commodity_list))
                else:
                    cv2.fillConvexPoly(mask, _cont, pix)
    return img, mask
def trainBatch():
    """One MORAN training step (optionally bidirectional); returns the loss."""
    data = next(train_iter)  # bug fix: Py2-only iterator.next()
    if opt.BidirDecoder:
        cpu_images, cpu_texts, cpu_texts_rev = data  # batch with labels
        utils.loadData(image, cpu_images)            # copy images into the global buffer
        t, l = converter.encode(cpu_texts, scanned=True)          # encode forward text to class ids
        t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)  # encode reversed text to class ids
        utils.loadData(text, t)
        utils.loadData(text_rev, t_rev)
        utils.loadData(length, l)
        preds0, preds1 = MORAN(image, length, text, text_rev)  # forward and reverse predictions
        # Cross-entropy over both directions jointly.
        cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
    else:
        cpu_images, cpu_texts = data
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts, scanned=True)
        utils.loadData(text, t)
        utils.loadData(length, l)
        # NOTE(review): text_rev is a module global that is not refreshed in
        # this branch — confirm intent.
        preds = MORAN(image, length, text, text_rev)
        cost = criterion(preds, text)
    MORAN.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def val(dataset, criterion, max_iter=1000):
    """Validation pass.

    :param dataset: validation dataset
    :param criterion: loss function
    :param max_iter: cap on the number of batches to evaluate
    :return: accuracy

    Fixes: Py2-only ``iterator.next()`` replaced with ``next()``; deprecated
    ``tensor.data[0]`` replaced with ``.item()`` (matching the other ``val``
    variants in this file).
    """
    print('Start val')
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=opt.batchSize,
        num_workers=int(opt.workers))  # opt.batchSize
    val_iter = iter(data_loader)
    max_iter = min(max_iter, len(data_loader))
    n_correct = 0
    n_total = 0
    loss_avg = utils.averager()
    for i in range(max_iter):
        data = next(val_iter)
        if opt.BidirDecoder:
            cpu_images, cpu_texts, cpu_texts_rev = data
            utils.loadData(image, cpu_images)
            t, l = converter.encode(cpu_texts, scanned=True)          # label convert
            t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)  # reversed labels for the second decoder
            utils.loadData(text, t)
            utils.loadData(text_rev, t_rev)
            utils.loadData(length, l)
            preds0, preds1 = MORAN(image, length, text, text_rev, test=True)
            cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
            preds0_prob, preds0 = preds0.max(1)
            preds0 = preds0.view(-1)
            preds0_prob = preds0_prob.view(-1)
            sim_preds0 = converter.decode(preds0.data, length.data)
            preds1_prob, preds1 = preds1.max(1)
            preds1 = preds1.view(-1)
            preds1_prob = preds1_prob.view(-1)
            sim_preds1 = converter.decode(preds1.data, length.data)
            sim_preds = []
            for j in range(cpu_images.size(0)):
                text_begin = 0 if j == 0 else length.data[:j].sum()
                # Keep the direction with the higher mean confidence up to '$' (EOS).
                if torch.mean(preds0_prob[text_begin:text_begin + len(sim_preds0[j].split('$')[0] + '$')]).item() > \
                        torch.mean(preds1_prob[text_begin:text_begin + len(sim_preds1[j].split('$')[0] + '$')]).item():
                    sim_preds.append(sim_preds0[j].split('$')[0] + '$')
                else:
                    sim_preds.append(sim_preds1[j].split('$')[0][-1::-1] + '$')
        else:
            cpu_images, cpu_texts = data
            utils.loadData(image, cpu_images)
            t, l = converter.encode(cpu_texts, scanned=True)
            utils.loadData(text, t)
            utils.loadData(length, l)
            # NOTE(review): text_rev is a module global, possibly stale in
            # this branch — confirm intent.
            preds = MORAN(image, length, text, text_rev, test=True)
            cost = criterion(preds, text)
            _, preds = preds.max(1)
            preds = preds.view(-1)
            sim_preds = converter.decode(preds.data, length.data)
        # cal acc
        loss_avg.add(cost)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1
            n_total += 1
    print("correct / total: %d / %d, " % (n_correct, n_total))
    accuracy = n_correct / float(n_total)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
    return accuracy
def val(dataset, criterion, max_iter=10000, steps=None):
    """Validate the model, log metrics via ``log.scalar_summary`` and return
    accuracy.

    Fix: Py2-only ``iterator.next()`` replaced with ``next()``. Comments
    translated to English.
    """
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=False, batch_size=opt.batchSize,
        num_workers=int(opt.workers))  # opt.batchSize
    val_iter = iter(data_loader)
    max_iter = min(max_iter, len(data_loader))
    n_correct = 0
    n_total = 0
    distance = 0.0
    loss_avg = utils.averager()
    # f = open('./log.txt', 'a', encoding='utf-8')
    for i in range(max_iter):
        data = next(val_iter)
        if opt.BidirDecoder:
            cpu_images, cpu_texts, cpu_texts_rev = data  # one batch from the loader
            utils.loadData(image, cpu_images)
            # encode maps characters to class ids (unscanned labels here).
            t, l = converter.encode(cpu_texts, scanned=False)
            t_rev, _ = converter.encode(cpu_texts_rev, scanned=False)
            utils.loadData(text, t)
            utils.loadData(text_rev, t_rev)
            utils.loadData(length, l)
            preds0, preds1 = MORAN(image, length, text, text_rev,
                                   debug=False, test=True, steps=steps)
            cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
            preds0_prob, preds0 = preds0.max(1)  # top-1 class per position
            preds0 = preds0.view(-1)
            preds0_prob = preds0_prob.view(-1)
            sim_preds0 = converter.decode(preds0.data, length.data)  # ids back to characters
            preds1_prob, preds1 = preds1.max(1)
            preds1 = preds1.view(-1)
            preds1_prob = preds1_prob.view(-1)
            sim_preds1 = converter.decode(preds1.data, length.data)
            sim_preds = []  # final per-sample strings
            for j in range(cpu_images.size(0)):
                text_begin = 0 if j == 0 else length.data[:j].sum()
                # Keep the direction with the higher mean confidence up to '$' (EOS).
                if torch.mean(preds0_prob[text_begin:text_begin + len(sim_preds0[j].split('$')[0] + '$')]).item() > \
                        torch.mean(preds1_prob[text_begin:text_begin + len(sim_preds1[j].split('$')[0] + '$')]).item():
                    sim_preds.append(sim_preds0[j].split('$')[0] + '$')
                else:
                    sim_preds.append(sim_preds1[j].split('$')[0][-1::-1] + '$')
        else:
            # Unidirectional fallback branch.
            cpu_images, cpu_texts = data
            utils.loadData(image, cpu_images)
            t, l = converter.encode(cpu_texts, scanned=True)
            utils.loadData(text, t)
            utils.loadData(length, l)
            preds = MORAN(image, length, text, text_rev, test=True)
            cost = criterion(preds, text)
            _, preds = preds.max(1)
            preds = preds.view(-1)
            sim_preds = converter.decode(preds.data, length.data)
        loss_avg.add(cost)  # running mean of the loss
        # Compare predictions against ground truth.
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():  # exact match
                n_correct += 1
            # f.write("pred %s\t target %s\n" % (pred, target))
            distance += levenshtein(pred, target) / max(len(pred), len(target))  # normalized edit distance
            n_total += 1
    # f.close()
    # Print the last batch's decodes and summary metrics.
    for pred, gt in zip(sim_preds, cpu_texts):
        gt = ''.join(gt.split(opt.sep))
        print('%-20s, gt: %-20s' % (pred, gt))
    print("correct / total: %d / %d, " % (n_correct, n_total))
    print('levenshtein distance: %f' % (distance / n_total))
    accuracy = n_correct / float(n_total)
    log.scalar_summary('Validation/levenshtein distance', distance / n_total, steps)
    log.scalar_summary('Validation/loss', loss_avg.val(), steps)
    log.scalar_summary('Validation/accuracy', accuracy, steps)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))
    return accuracy
def trainBatch():
    """Fetch one batch [images, labels] and run a single training step;
    returns the loss tensor."""
    data = next(train_iter)  # bug fix: Py2-only iterator.next()
    if opt.BidirDecoder:
        cpu_images, cpu_texts, cpu_texts_rev = data
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts, scanned=True)
        t_rev, _ = converter.encode(cpu_texts_rev, scanned=True)
        utils.loadData(text, t)
        utils.loadData(text_rev, t_rev)
        utils.loadData(length, l)
        # Bidirectional LSTM yields two prediction streams.
        preds0, preds1 = MORAN(image, length, text, text_rev)
        cost = criterion(torch.cat([preds0, preds1], 0), torch.cat([text, text_rev], 0))
    else:
        cpu_images, cpu_texts = data
        utils.loadData(image, cpu_images)
        # Labels and the length of each label.
        t, l = converter.encode(cpu_texts, scanned=True)
        utils.loadData(text, t)
        utils.loadData(length, l)
        # Unidirectional LSTM yields a single prediction stream.
        preds = MORAN(image, length, text, text_rev)
        cost = criterion(preds, text)
    MORAN.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
# Batch-prediction script fragment: build the converter, dataset and loader,
# allocate CUDA buffers, then run MORAN over every batch and record results
# to a CSV. The loop body appears to continue past this view (writer is
# created but its use is not visible here).
converter = utils.strLabelConverterForAttention(args.alphabet, ':')
pred_dataset = dataset.lmdbDataset(root=os.path.join('dataset', args.data),
                                   transform=dataset.resizeNormalize(
                                       (100, 32)))
pred_loader = torch.utils.data.DataLoader(pred_dataset, shuffle=False,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)
# Pre-allocated GPU buffers reused for every batch.
image = torch.FloatTensor(args.batch_size, args.nc, args.imgH, args.imgW).cuda()
text = torch.LongTensor(args.batch_size * 5).cuda()
length = torch.IntTensor(args.batch_size).cuda()
# Dummy '0'-filled targets of max_iter characters per sample.
t, l = converter.encode(['0' * args.max_iter] * args.batch_size, scanned=True)
utils.loadData(text, t)
utils.loadData(length, l)
f = open(os.path.join('logger', args.data + '.csv'), 'w', newline='',
         encoding='utf-8')
writer = csv.writer(f)
for i, (img_keys, cpu_images) in enumerate(pred_loader):
    utils.loadData(image, cpu_images)
    # Re-encode per batch since the last batch may be smaller.
    t, l = converter.encode(['0' * args.max_iter] * cpu_images.size(0),
                            scanned=True)
    utils.loadData(text, t)
    utils.loadData(length, l)
    preds0, _, preds1, _ = MORAN(image, length, text, text, test=True)