# Example 1
def process_batch(nets, optim, optim2, image_size, args):
  """Run one joint training iteration of the detector and the CTC OCR net.

  Steps the detection solver once, matches its word detections against the
  ground-truth boxes, crops rotation-normalized word patches from the
  original images, groups them into width buckets and steps the CTC solver
  once per bucket.

  Args:
    nets: (detection_solver_wrapper, ctc_solver_wrapper); `.net` on each
          yields the underlying Caffe net.
    optim: solver for the CTC (recognition) net.
    optim2: solver for the detection net.
    image_size: (width, height) of the detector input.
    args: parsed CLI options (uses .batch_size and .debug).

  Side effects: updates globals `it`, `mean_loss`, `mean_rec`; steps both
  solvers; shows debug windows when args.debug; snapshots every 1000 its.
  """
  global it, mean_loss, mean_rec

  net, net_ctc = nets

  net = net.net
  net_ctc = net_ctc.net

  # Reshape the detector input to the current batch size / image size.
  net.blobs['data'].reshape(args.batch_size, 1, image_size[1], image_size[0])
  net.reshape()

  it += 1

  optim2.step(1)  # one forward/backward/update of the detection net

  im = net.blobs['data'].data[...]
  # NCHW -> NHWC copies: one for visualization, one for cropping.
  draw = np.swapaxes(im, 2, 3)
  draw = np.swapaxes(draw, 1, 3)
  im_ctc = np.copy(draw)
  draw += 1
  draw *= 128  # inputs are normalized around [-1, 1]; map back to [0, 255]
  draw = np.array(draw, dtype="uint8").copy()

  if args.debug:
    # Overlay a horizontal grid on the first image of the batch.
    grid_step = 16
    line = 0
    while line < image_size[0]:
      cv2.line(draw[0], (0, line), (image_size[1], line), (128, 128, 128))
      line += grid_step

  boxes = net.blobs['boxes'].data[...]

  word_gtob = net.blobs['gt_boxes'].data[...]
  word_txt = net.blobs['gt_labels'].data[...]

  lines_gtob = net.blobs['line_boxes'].data[...]
  lines_txt = net.blobs['line_labels'].data[...]

  # De-normalize detections: centers to pixels, sizes by the image diagonal.
  boxes[:, 0, :, 0] *= image_size[0]
  boxes[:, 0, :, 1] *= image_size[1]
  normFactor = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0])
  boxes[:, 0, :, 2] *= normFactor
  boxes[:, 0, :, 3] *= normFactor

  sum_cost = 0
  count = 0

  labels_gt = []
  labels_det = []

  gt_to_detection = {}  # gtnum -> [best overlap, detection index, gt coverage]
  net_ctc.clear_param_diffs()

  batch_buckets = []
  dummy = {}

  matched_detections = 0
  for bid in range(im.shape[0]):

    # Original (full-resolution) grayscale image, cropped to this sample.
    o_image = net.layers[0].get_image_file_name(bid)
    o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
    cx = net.layers[0].get_crop(bid, 0)
    cy = net.layers[0].get_crop(bid, 1)
    cmx = net.layers[0].get_crop(bid, 2)
    cmy = net.layers[0].get_crop(bid, 3)
    o_image = o_image[cy:cmy, cx:cmx]

    # Detections are packed front-to-back; count the valid ones.
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
      det_word = boxes[bid, 0, i]
      if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
        break
      boxes_count += 1

    x = [i for i in range(boxes_count)]

    bucket_images = {}
    batch_buckets.append(bucket_images)

    word_gto = word_gtob[bid]
    word_gto_txt = word_txt[bid]
    gt_count = 0
    for gt_no in range(word_gto.shape[0]):
      gt = word_gto[gt_no, :]
      gt = gt.reshape(6)
      gtnum = 1000 * bid + gt_no

      if gt[5] == -1:
        # Marked as "ignore" ground truth.
        continue

      gt_count += 1

      txt = word_gto_txt[gt_no, :]
      gtbox = ((gt[0] * image_size[0], gt[1] * image_size[1]), (gt[2] * normFactor, gt[3] * normFactor), gt[4] * 180 / 3.14)
      gtbox = cv2.boxPoints(gtbox)

      gtbox = np.array(gtbox, dtype="int")
      rect_gt = cv2.boundingRect(gtbox)

      # Skip GT boxes touching the image border.
      if rect_gt[0] == 0 or rect_gt[1] == 0 or rect_gt[0] + rect_gt[2] >= image_size[0] or rect_gt[1] + rect_gt[3] >= image_size[1]:
        continue

      if gt[3] * normFactor < 3:
        # Too small to recognize.
        if args.debug:
          vis.draw_box_points(draw[bid], gtbox, color=(255, 255, 0))
          cv2.imshow('draw', draw[bid])
        continue

      if args.debug:
        vis.draw_box_points(draw[bid], gtbox, color=(0, 0, 0), thickness=2)

      # Convert (x, y, w, h) into (x1, y1, x2, y2).
      rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
      rect_gt[2] += rect_gt[0]
      rect_gt[3] += rect_gt[1]

      for i in range(0, min(100, boxes_count)):
        det_word = boxes[bid, 0, x[i], :]

        # BUG FIX: fetch the detection *before* testing it -- the original
        # ran the angle check against a stale det_word left over from a
        # previous loop on the first iteration.
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
          break

        # Reject detections whose angle differs too much from the GT angle.
        if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
          continue

        box = ((det_word[0], det_word[1]), (det_word[2], det_word[3]), det_word[4] * 180 / 3.14)
        box = cv2.boxPoints(box)

        if args.debug:
          boxp = np.array(box, dtype="int")
          vis.draw_box_points(draw[bid], boxp, color=(0, 255, 0))

        box = np.array(box, dtype="int")
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]

        # Axis-aligned rectangle intersection / union overlap.
        inter = intersect(bbox, rect_gt)
        uni = union(bbox, rect_gt)
        ratio = area(inter) / float(area(uni))

        # The detection must cover nearly all of the GT box ...
        ratio_gt = area(inter) / float(area(rect_gt))
        if ratio_gt < 0.95:
          continue

        # ... and overlap it well overall.
        if ratio < 0.5:
          continue

        # Remember the best-matching detection for this GT box.
        if gtnum not in gt_to_detection:
          gt_to_detection[gtnum] = [0, 0, 0]
        tupl = gt_to_detection[gtnum]
        if tupl[0] < ratio:
          tupl[0] = ratio
          tupl[1] = x[i]
          tupl[2] = ratio_gt

        det_word = boxes[bid, 0, x[i], :]
        box = ([det_word[0], det_word[1]], [det_word[2], det_word[3]], det_word[4] * 180 / 3.14)

        # Map the detection back to the original image and crop a
        # rotation-normalized word patch.
        boxO = get_obox(im_ctc[bid], o_image, box)
        boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]), boxO[2])
        norm2, rot_mat = get_normalized_image(o_image, boxO)
        if norm2 is None:
          continue

        # Pick the bucket whose width best matches the (widened) crop width.
        width_scale = 32.0 / norm2.shape[0]
        width = norm2.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
          if best_diff > abs(width * 1.3 - buckets[b]):
            best_diff = abs(width * 1.3 - buckets[b])
            bestb = b

        # Resize to bucket width x 32 and normalize to roughly [-1, 1],
        # zero mean.  (np.float64 replaces the deprecated np.float alias.)
        scaled = cv2.resize(norm2, (buckets[bestb], 32))
        scaled = np.asarray(scaled, dtype=np.float64)
        delta = scaled.max() - scaled.min()
        scaled = (scaled) / (delta / 2)
        scaled -= scaled.mean()

        if bestb not in bucket_images:
          bucket_images[bestb] = {}
          bucket_images[bestb]['img'] = []
          bucket_images[bestb]['sizes'] = []
          bucket_images[bestb]['txt'] = []
          bucket_images[bestb]['gt_enc'] = []
          dummy[bestb] = 1
        else:
          # Cap the number of crops per bucket.
          if args.debug and len(bucket_images[bestb]) > 4:
            continue
          elif len(bucket_images[bestb]) > 32:
            continue

        # Encode the GT transcription; 0 = CTC blank, 3 = out-of-codec char.
        gt_labels = []
        txt_enc = ''
        for k in range(txt.shape[1]):
          if txt[0, k] > 0:
            if txt[0, k] in codec_rev:
              gt_labels.append(codec_rev[txt[0, k]])
            else:
              gt_labels.append(3)

            txt_enc += unichr(txt[0, k])  # Python 2 (`unichr`)
          else:
            gt_labels.append(0)

        if scaled.ndim == 3:
          scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        if args.debug:
          cv2.imshow('scaled', scaled)
        bucket_images[bestb]['sizes'].append(len(gt_labels))
        bucket_images[bestb]['gt_enc'].append(gt_labels)
        bucket_images[bestb]['txt'].append(txt_enc)
        bucket_images[bestb]['img'].append(scaled)
        matched_detections += 1

  # Train the OCR net bucket by bucket.
  # NOTE(review): this loop reads `bucket_images` of the *last* batch sample
  # only; `batch_buckets` is filled but never consumed -- confirm intent.
  for bucket in bucket_images.keys():

    imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float64)
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

    net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1], imtf.shape[2], imtf.shape[3])
    net_ctc.blobs['data'].data[...] = imtf

    labels = bucket_images[bucket]['gt_enc']
    txt = bucket_images[bucket]['txt']

    # CTC needs label rows of identical length; pad with blanks (0).
    max_len = 0
    for l in range(0, len(labels)):
      max_len = max(max_len, len(labels[l]))
    for l in range(0, len(labels)):
      while len(labels[l]) < max_len:
        labels[l].append(0)

    labels = np.asarray(labels, np.float64)

    net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])

    net_ctc.blobs['label'].data[...] = labels

    if args.debug:
        vis.vis_square(imtf[0])
        cv2.imshow('draw', draw[0])
        cv2.waitKey(5)

    optim.step(1)  # one CTC solver step on this bucket
    sum_cost += net_ctc.blobs['loss'].data[...]
    if net_ctc.blobs['loss'].data[...] > 10:
      # Large loss: show the batch and the current greedy decoding.
      vis.vis_square(imtf[0])
      cv2.imshow('draw', draw[0])
      sf = net_ctc.blobs['transpose'].data[...]
      labels2 = sf.argmax(3)
      out = utils.print_seq(labels2[:, 0, :])
      print(u'{0} - {1}'.format(out, txt[0]))
      cv2.waitKey(5)

    count += imtf.shape[0]

  # Word-recognition accuracy over collected (gt, detected) text pairs.
  # NOTE(review): labels_gt / labels_det are never filled above, so this
  # always yields 0 -- confirm whether collection code was removed upstream.
  correct_cout = 0
  for i in range(len(labels_gt)):
    det_text = labels_det[i]
    gt_text = labels_gt[i]

    if it % 100 == 0:
      print(u"{0} - {1}".format(det_text, gt_text).encode('utf8'))
    if det_text == gt_text:
      correct_cout += 1

  count = max(count, 1)
  # Exponential moving averages of loss and recognition rate.
  mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
  mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(max(1, len(labels_gt)))

  # Localization recall: GT boxes whose best detection overlap exceeds 0.5.
  tp = 0
  for bid in range(im.shape[0]):
    word_gto = word_gtob[bid]
    for gt_no in range(len(word_gto)):
      gt = word_gto[gt_no]
      gtnum = 1000 * bid + gt_no
      if gtnum in gt_to_detection:
        tupl = gt_to_detection[gtnum]
        if tupl[0] > 0.5:
          tp += 1

  # NOTE(review): gt_count only holds the count of the *last* batch sample.
  loc_recall = tp / float(max(1, gt_count))
  if args.debug:
    cv2.imshow('draw', draw[0])
    if im.shape[0] > 1:
        cv2.imshow('draw2', draw[1])

    cv2.waitKey(10)

  if it % 10 == 0:
    print('{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'.format(it, 0.0001, sum_cost / count, mean_loss, correct_cout / float(max(1, len(labels_gt))), mean_rec, loc_recall, matched_detections))

  if it % 1000 == 0:
    optim.snapshot()
    optim2.snapshot()
# Example 2
def process_batch(nets, optim, optim2, image_size, args):
    """Run one training iteration: detector step plus CTC data preparation.

    Variant of the joint training loop: steps the detection solver, matches
    detections against GT boxes, buckets the cropped word images and feeds
    them through the CTC net (the CTC solver step itself is commented out).

    Args:
        nets: (detection_solver_wrapper, ctc_solver_wrapper); `.net` on
              each yields the underlying Caffe net.
        optim: CTC solver (only `.snapshot()` would be used; step disabled).
        optim2: solver for the detection net.
        image_size: (width, height) of the detector input.
        args: parsed CLI options (uses .batch_size and .debug).

    Side effects: updates globals `it`, `mean_loss`, `mean_rec`; steps the
    detection solver; snapshots every `snapshot_interval` iterations.
    """
    global it, mean_loss, mean_rec
    it += 1  # advance the global iteration counter

    net, net_ctc = nets

    net = net.net
    net_ctc = net_ctc.net

    # Reshape the detector input blob to the current batch.
    net.blobs['data'].reshape(args.batch_size, 1, image_size[1],
                              image_size[0])
    net.reshape()

    optim2.step(1)  # one forward/backward/update of the detection net

    im = net.blobs['data'].data[...]  # shape [batch_size, 1, H, W]
    # NCHW -> NHWC copies: one for visualization, one for cropping.
    draw = np.swapaxes(im, 2, 3)
    draw = np.swapaxes(draw, 1, 3)
    im_ctc = np.copy(draw)
    draw += 1
    draw *= 128  # inputs normalized around [-1, 1]; map back to [0, 255]
    draw = np.array(draw, dtype="uint8").copy()

    if args.debug:
        # Overlay a horizontal grid on the first image of the batch.
        grid_step = 16
        line = 0
        while line < image_size[0]:
            cv2.line(draw[0], (0, line), (image_size[1], line),
                     (128, 128, 128))
            line += grid_step

    boxes = net.blobs['boxes'].data[...]  # shape (batch, 1, 500, 15)

    word_gtob = net.blobs['gt_boxes'].data[...]  # shape (batch, 6, 1, 6)
    word_txt = net.blobs['gt_labels'].data[...]  # shape (batch, 6, 1, 14)

    lines_gtob = net.blobs['line_boxes'].data[...]  # shape (batch, 1, 1, 5)
    lines_txt = net.blobs['line_labels'].data[...]  # shape (batch, 1, 1, 7)

    # De-normalize detections: centers to pixels, sizes by the diagonal.
    boxes[:, 0, :, 0] *= image_size[0]
    boxes[:, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[:, 0, :, 2] *= normFactor
    boxes[:, 0, :, 3] *= normFactor

    sum_cost = 0
    count = 0

    labels_gt = []
    labels_det = []

    gt_to_detection = {}  # gtnum -> [best overlap, det index, gt coverage]
    net_ctc.clear_param_diffs()

    batch_buckets = []
    dummy = {}

    matched_detections = 0
    for bid in range(im.shape[0]):  # iterate over the samples in the batch

        # Original grayscale image, cropped to this sample's window.
        o_image = net.layers[0].get_image_file_name(bid)
        o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
        cx = net.layers[0].get_crop(bid, 0)
        cy = net.layers[0].get_crop(bid, 1)
        cmx = net.layers[0].get_crop(bid, 2)
        cmy = net.layers[0].get_crop(bid, 3)
        o_image = o_image[cy:cmy, cx:cmx]

        # Detections are packed front-to-back; count the valid ones.
        boxes_count = 0
        for i in range(0, boxes.shape[2]):
            det_word = boxes[bid, 0, i]
            if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
                break
            boxes_count += 1

        x = [i for i in range(boxes_count)]

        bucket_images = {}
        batch_buckets.append(bucket_images)

        word_gto = word_gtob[bid]
        word_gto_txt = word_txt[bid]
        gt_count = 0
        for gt_no in range(word_gto.shape[0]):
            gt = word_gto[gt_no, :]
            gt = gt.reshape(6)
            gtnum = 1000 * bid + gt_no

            if gt[5] == -1:
                # Marked as "ignore" ground truth.
                continue

            gt_count += 1

            txt = word_gto_txt[gt_no, :]
            gtbox = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                     (gt[2] * normFactor,
                      gt[3] * normFactor), gt[4] * 180 / 3.14)
            gtbox = cv2.boxPoints(gtbox)

            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)

            # Skip GT boxes touching the image border.
            if rect_gt[0] == 0 or rect_gt[
                    1] == 0 or rect_gt[0] + rect_gt[2] >= image_size[
                        0] or rect_gt[1] + rect_gt[3] >= image_size[1]:
                continue

            if gt[3] * normFactor < 3:
                # Too small to recognize.
                if args.debug:
                    print('too small gt!')
                continue

            # Convert (x, y, w, h) into (x1, y1, x2, y2).
            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]
            rect_gt[3] += rect_gt[1]

            for i in range(0, min(100, boxes_count)):
                det_word = boxes[bid, 0, x[i], :]

                # BUG FIX: fetch the detection *before* testing it -- the
                # original ran the angle check against a stale det_word from
                # a previous loop on the first iteration.
                if (det_word[0] == 0
                        and det_word[1] == 0) or det_word[5] < 0.01:
                    break

                # Reject detections with an angle too far from the GT angle.
                if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
                    continue

                box = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                       det_word[4] * 180 / 3.14)
                box = cv2.boxPoints(box)

                if args.debug:
                    boxp = np.array(box, dtype="int")
                    vis.draw_box_points(draw[bid], boxp, color=(0, 255, 0))

                box = np.array(box, dtype="int")
                bbox = cv2.boundingRect(box)
                bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
                bbox[2] += bbox[0]
                bbox[3] += bbox[1]

                # Axis-aligned rectangle intersection / union overlap.
                inter = intersect(bbox, rect_gt)
                uni = union(bbox, rect_gt)
                ratio = area(inter) / float(area(uni))

                # The detection must cover nearly all of the GT box ...
                ratio_gt = area(inter) / float(area(rect_gt))
                if ratio_gt < 0.95:
                    continue

                # ... and overlap it well overall.
                if ratio < 0.5:
                    continue

                # Remember the best-matching detection for this GT box.
                if gtnum not in gt_to_detection:
                    gt_to_detection[gtnum] = [0, 0, 0]
                tupl = gt_to_detection[gtnum]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = x[i]
                    tupl[2] = ratio_gt

                det_word = boxes[bid, 0, x[i], :]
                box = ([det_word[0],
                        det_word[1]], [det_word[2],
                                       det_word[3]], det_word[4] * 180 / 3.14)

                # Map back to the original image and crop a
                # rotation-normalized word patch.
                boxO = get_obox(im_ctc[bid], o_image, box)
                boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]),
                        boxO[2])
                norm2, rot_mat = get_normalized_image(o_image, boxO)
                if norm2 is None:
                    continue

                # Pick the bucket closest to the (widened) crop width.
                width_scale = 32.0 / norm2.shape[0]
                width = norm2.shape[1] * width_scale
                best_diff = width
                bestb = 0
                for b in range(0, len(buckets)):
                    if best_diff > abs(width * 1.3 - buckets[b]):
                        best_diff = abs(width * 1.3 - buckets[b])
                        bestb = b

                # Resize to bucket width x 32, normalize roughly to [-1, 1],
                # zero mean.  (np.float64 replaces deprecated np.float.)
                scaled = cv2.resize(norm2, (buckets[bestb], 32))
                scaled = np.asarray(scaled, dtype=np.float64)
                delta = scaled.max() - scaled.min()
                scaled = (scaled) / (delta / 2)
                scaled -= scaled.mean()

                if bestb not in bucket_images:
                    bucket_images[bestb] = {}
                    bucket_images[bestb]['img'] = []
                    bucket_images[bestb]['sizes'] = []
                    bucket_images[bestb]['txt'] = []
                    bucket_images[bestb]['gt_enc'] = []
                    dummy[bestb] = 1
                else:
                    # Cap the number of crops per bucket.
                    if args.debug and len(bucket_images[bestb]) > 4:
                        continue
                    elif len(bucket_images[bestb]) > 32:
                        continue

                # Encode the GT transcription; 0 = blank, 3 = out-of-codec.
                gt_labels = []
                txt_enc = ''
                for k in range(txt.shape[1]):
                    if txt[0, k] > 0:
                        if txt[0, k] in codec_rev:
                            gt_labels.append(codec_rev[txt[0, k]])
                        else:
                            gt_labels.append(3)

                        txt_enc += unichr(txt[0, k])  # Python 2 (`unichr`)
                    else:
                        gt_labels.append(0)

                if scaled.ndim == 3:
                    scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                if args.debug:
                    cv2.imshow('scaled', scaled)
                bucket_images[bestb]['sizes'].append(len(gt_labels))
                bucket_images[bestb]['gt_enc'].append(gt_labels)
                bucket_images[bestb]['txt'].append(txt_enc)
                bucket_images[bestb]['img'].append(scaled)
                matched_detections += 1

    # Feed the OCR net bucket by bucket.
    # NOTE(review): this loop reads `bucket_images` of the *last* batch
    # sample only; `batch_buckets` is filled but never consumed -- confirm.
    for bucket in bucket_images.keys():

        imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float64)
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        labels = bucket_images[bucket]['gt_enc']
        txt = bucket_images[bucket]['txt']

        # CTC needs identical-length label rows; pad with blanks (0).
        max_len = 0
        for l in range(0, len(labels)):
            max_len = max(max_len, len(labels[l]))
        for l in range(0, len(labels)):
            while len(labels[l]) < max_len:
                labels[l].append(0)

        labels = np.asarray(labels, np.float64)

        net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])

        net_ctc.blobs['label'].data[...] = labels

        if args.debug:
            vis.vis_square(imtf[0])
            cv2.imshow('draw', draw[0])
            cv2.waitKey(5)

        #optim.step(1)  # CTC solver step deliberately disabled here
        sum_cost += net_ctc.blobs['loss'].data[...]
        if net_ctc.blobs['loss'].data[...] > 10:
            # Large loss: print the current greedy decoding for inspection.
            sf = net_ctc.blobs['transpose'].data[...]
            labels2 = sf.argmax(3)
            out = utils.print_seq(labels2[:, 0, :])
            print(u'{0} --- {1}'.format(out, txt[0]))

        count += imtf.shape[0]

    # Word-recognition accuracy over collected (gt, detected) text pairs.
    # NOTE(review): labels_gt / labels_det are never filled above, so this
    # always yields 0 -- confirm whether collection code was removed.
    correct_cout = 0
    for i in range(len(labels_gt)):
        det_text = labels_det[i]
        gt_text = labels_gt[i]

        if it % 100 == 0:
            pass
            #print( u"{0} -- {1}".format(det_text, gt_text).encode('utf8') )
        if det_text == gt_text:
            correct_cout += 1

    count = max(count, 1)
    # Exponential moving averages of loss and recognition rate.
    mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
    mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(
        max(1, len(labels_gt)))

    # Localization recall: GT boxes whose best detection overlap > 0.5.
    tp = 0
    for bid in range(im.shape[0]):
        word_gto = word_gtob[bid]
        for gt_no in range(len(word_gto)):
            gt = word_gto[gt_no]
            gtnum = 1000 * bid + gt_no
            if gtnum in gt_to_detection:
                tupl = gt_to_detection[gtnum]
                if tupl[0] > 0.5:
                    tp += 1

    # NOTE(review): gt_count holds the count of the *last* batch sample only.
    loc_recall = tp / float(max(1, gt_count))

    if it % 10 == 0:
        print(
            '{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'
            .format(it, 0.0001, sum_cost / count, mean_loss,
                    correct_cout / float(max(1, len(labels_gt))), mean_rec,
                    loc_recall, matched_detections))

    if it % snapshot_interval == 0:
        #optim.snapshot()
        optim2.snapshot()
# Example 3
def process_batch(net, optim, args):
  """Train the CTC recognition net directly on pre-cropped word images.

  Reads up to `data_batch` image/transcription pairs from the global file
  list `f_list` (treated as a circular buffer over `data_total` samples),
  groups them into fixed-height (32 px) width buckets, and runs one solver
  step per bucket.

  Args:
    net: solver wrapper; `.net` yields the Caffe net, `.step()`/`.snapshot()`
         drive training.
    optim: unused here (kept for interface compatibility with the other
           process_batch variants).
    args: parsed CLI options (uses .data_dir and .debug).

  Side effects: advances the globals `it` and `data_index`; snapshots the
  net every `snapshot_interval` iterations.
  """
  global it, data_batch, data_total, data_index, f_list, codec_rev, codec
  text = []
  W = []
  H = []
  bucket_images = {}
  dummy = {}
  net_ctc = net.net

  # 1. Read images and GTs (circular buffer for small datasets).
  for img_ind in range(min(data_batch, data_total)):
    circ_ind = (data_index + img_ind) % data_total

    img_path = os.path.join(args.data_dir, f_list[circ_ind].replace('../', ''))
    img = cv2.imread(img_path.strip())
    # BUG FIX: check the imread result *before* dereferencing img.shape,
    # use `is None` (`img == None` is an elementwise array comparison),
    # and skip the unreadable sample instead of crashing later.
    if img is None:
      print("im read error")
      print("img_path:%s" % img_path)
      continue
    if img.shape[2] == 3:
      img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    W = img.shape[1]
    H = img.shape[0]
    # Transcription sits in the 7th whitespace-separated field of the
    # side-car .txt file.
    with open(img_path.strip().replace('.jpg', '.txt')) as f:
      item = f.readlines()
    anns = item[0].split()
    text = anns[6].decode('utf8')  # Python 2 bytes -> unicode
    # NOTE(review): incrementing data_index inside this loop while circ_ind
    # already adds img_ind makes the buffer skip every other sample --
    # confirm whether that is intended.
    data_index += 1
    # 2. Scale to fixed height 32 with proportional width.
    width_scale = 32.0 / H
    width = W * width_scale

    # Pick the bucket closest to the (slightly widened) scaled width.
    best_diff = width
    bestb = 0
    for b in range(0, len(buckets)):
      if best_diff > abs(width * 1.3 - buckets[b]):
        best_diff = abs(width * 1.3 - buckets[b])
        bestb = b

    # Resize to bucket width x 32, normalize roughly to [-1, 1], zero mean.
    # (np.float64 replaces the deprecated np.float alias.)
    scaled = cv2.resize(img, (buckets[bestb], 32))
    scaled = np.asarray(scaled, dtype=np.float64)
    delta = scaled.max() - scaled.min()
    scaled = (scaled) / (delta / 2.0)
    scaled -= scaled.mean()
    if bestb not in bucket_images:
      bucket_images[bestb] = {}
      bucket_images[bestb]['img'] = []
      bucket_images[bestb]['sizes'] = []
      bucket_images[bestb]['txt'] = []
      bucket_images[bestb]['gt_enc'] = []
      dummy[bestb] = 1
    # Encode the transcription; 0 = CTC blank, 3 = out-of-codec character.
    gt_labels = []
    txt_enc = ''
    for k in range(len(text)):
      t_unicode = text[k]
      # NOTE(review): t_unicode is a 1-char string, so under Python 2
      # ordering `t_unicode > 0` is always True and the else branch is
      # effectively dead -- confirm intent.
      if t_unicode > 0:
        if t_unicode in codec_rev:
          gt_labels.append(codec_rev[t_unicode])
        else:
          gt_labels.append(3)
      else:
        gt_labels.append(0)

    if scaled.ndim == 3:
      print(scaled.shape)
      scaled = cv2.cvtColor(scaled, cv2.COLOR_RGB2GRAY)
    if args.debug:
      cv2.imshow('scaled', scaled)
    bucket_images[bestb]['sizes'].append(len(gt_labels))
    bucket_images[bestb]['gt_enc'].append(gt_labels)
    bucket_images[bestb]['txt'].append(text)
    bucket_images[bestb]['img'].append(scaled)
  data_index = data_index % data_total

  # 3. Transfer each bucket into the net and train.
  for bucket in bucket_images.keys():
    imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float64)
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))
    net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1], imtf.shape[2], imtf.shape[3])
    net_ctc.blobs['data'].data[...] = imtf

    labels = bucket_images[bucket]['gt_enc']
    txt = bucket_images[bucket]['txt']
    # CTC needs identical-length label rows; pad with blanks (0).
    max_len = 0
    for l in range(0, len(labels)):
      max_len = max(max_len, len(labels[l]))
    for l in range(0, len(labels)):
      while len(labels[l]) < max_len:
        labels[l].append(0)

    labels = np.asarray(labels, np.float64)

    net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])

    net_ctc.blobs['label'].data[...] = labels
    # 4. Forward-backward + parameter update.
    net.step(1)
    it += 1
    # 5. Print the current greedy decoding next to the ground truth.
    sf = net_ctc.blobs['transpose'].data[...]
    labels2 = sf.argmax(3)
    out = utils.print_seq(labels2[:, 0, :])
    print(u'{0} <--> {1}'.format(out, txt[0]))

    if it % snapshot_interval == 0:
      net.snapshot()
      print('it is %d, and snapshot_interval is %d' % (it, snapshot_interval))
      print('snapshot saved')