Example #1
def froward_image(nets, scaled, original):
    global rec_t, ext_factor, ext_factorx

    net, net_ctc = nets
    print("nets:", nets)
    print("net: ", net)
    print("net_ctc :", net_ctc)

    img = [scaled]
    draw = img[0]

    imgo = original

    im = np.asarray(img, dtype=np.float)
    im = im / 128.0
    im = im - 1.0
    # im = im.reshape((3, im.shape[0], im.shape[1]))
    im = np.swapaxes(im, 1, 3)
    im = np.swapaxes(im, 2, 3)

    net.blobs['data'].reshape(im.shape[0], im.shape[1], im.shape[2],
                              im.shape[3])
    print("net.blobs['data'] :", net.blobs['data'])
    net.blobs['data'].data[...] = im
    print("im: ", im)
    net.reshape()
    print("net.reshape(): ", net.reshape())
    start = time.time()
    out = net.forward(start="conv1")
    end = time.time()
    seconds = end - start
    fps = 1 / seconds
    # print("loc fps:{0}".format(fps))

    boxes = out['boxes']
    print("boxes: ", boxes)
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms, :]
    print("boxes before boxes_count: ", boxes)
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.1:
            break
        boxes_count += 1

    detections_out = []

    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)
        print("boxr : this is r box,", boxr)
        box = cv2.boxPoints(boxr)
        print("box : sfter detection count,", box)

        box = np.array(box, dtype="int")
        vis.draw_box_points(draw, box, (255, 0, 0))
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]

        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(img[0], original, boxro)
        print("boxt 1 :", boxt)
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])
        print("boxt 2 :", boxt)

        norm2, rot_mat = get_normalized_image(original, boxt)
        if norm2 is None:
            continue

        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)
        print("Given Norm :", norm)

        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
            if best_diff > abs(width - buckets[b]):
                best_diff = abs(width - buckets[b])
                bestb = b

        scaled = cv2.resize(norm, (buckets[bestb], 32))

        cv2.imshow('norm2', scaled)

        imtf = np.asarray([scaled], dtype=np.float)
        delta = imtf.max() - imtf.min()
        imtf /= (delta / 2)
        imtf -= imtf.mean()
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        outctc = net_ctc.forward()
        print("outctc : ", outctc)
        ctc_f = outctc['softmax']
        print("ctc_f : ", ctc_f)

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(2)
        mask = labels > 2
        masked = ctc_f.max(2)[mask]
        mean_conf = np.sum(masked) / masked.shape[0]
        print("mean_conf : ", mean_conf)

        if mean_conf < 0.2:
            vis.draw_box_points(scaled, box, color=(0, 0, 0))
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))

        if len(det_text) == 0:
            continue

        if len(det_text) < 3 and mean_conf < 0.8:
            continue
        print("detections_out: ", detections_out)
        detections_out.append((boxt, (det_text, mean_conf, int(det_word[6]))))
        continue  # NOTE: this makes everything below in the loop body unreachable (the split/dictionary decoding path is left disabled)

        splits_raw = process_splits(det_text,
                                    conf,
                                    dec_s,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    original,
                                    0,
                                    mean_conf,
                                    alow_non_dict=True)
        detections_out.extend(splits_raw)
        continue

        if out_raw is not None:
            out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format( \
                'vid', box[0, 0], box[0, 1], box[1, 0], box[1, 1], \
                box[2, 0], box[2, 1], box[3, 0], box[3, 1], det_text, det_text, mean_conf).encode('utf8'))

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])

        if len(best_dict) == 0:
            continue
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f,
                                    rot_mat, boxt, original, 1, mean_conf)
        detections_out.extend(splits_out)

    return detections_out, fps
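
The call pattern for this function is not shown in the snippet. A minimal usage sketch, assuming placeholder Caffe model files and frame path, a 416x416 detector input (the size mentioned in Example #2's comments), and that the script's module globals (buckets, ext_factor, ext_factorx, debug) are already set up:

import caffe
import cv2

image_size = (416, 416)  # assumed detector input size, matching Example #2's comment
caffe.set_mode_gpu()
net = caffe.Net('detector.prototxt', 'detector.caffemodel', caffe.TEST)       # placeholder files
net_ctc = caffe.Net('recognizer.prototxt', 'recognizer.caffemodel', caffe.TEST)

original = cv2.imread('frame.jpg')
scaled = cv2.resize(original, image_size)

detections, fps = froward_image((net, net_ctc), scaled, original)
for boxt, recognition in detections:
    print(boxt, recognition)  # rotated box and its (text, confidence, ...) tuple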
Example #2
def process_batch(nets, optim, optim2, image_size, args):
    global it, mean_loss, mean_rec
    it += 1  # increment the iteration counter

    net, net_ctc = nets

    net = net.net
    net_ctc = net_ctc.net

    net.blobs['data'].reshape(args.batch_size, 1, image_size[1],
                              image_size[0])  # reshape the data blob to one batch of input images
    net.reshape()

    optim2.step(1)

    im = net.blobs['data'].data[...]  # shape [batch_size,1,416,416]
    draw = np.swapaxes(im, 2, 3)
    draw = np.swapaxes(draw, 1, 3)
    im_ctc = np.copy(draw)
    draw += 1
    draw *= 128
    draw = np.array(draw, dtype="uint8").copy()

    if args.debug:
        grid_step = 16
        line = 0
        while line < image_size[0]:
            cv2.line(draw[0], (0, line), (image_size[1], line),
                     (128, 128, 128))
            line += grid_step

    boxes = net.blobs['boxes'].data[...]  # shape (4, 1, 500, 15)

    word_gtob = net.blobs['gt_boxes'].data[...]  # shape  (4, 6, 1, 6)
    word_txt = net.blobs['gt_labels'].data[...]  # shape (4, 6, 1, 14)

    lines_gtob = net.blobs['line_boxes'].data[...]  # shape (4, 1, 1, 5)
    lines_txt = net.blobs['line_labels'].data[...]  # shape (4, 1, 1, 7)

    #nms = boxeso[:, 0, 0, 8] == 0
    #boxes = boxes[:, :, nms, :]

    boxes[:, 0, :, 0] *= image_size[0]
    boxes[:, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[:, 0, :, 2] *= normFactor
    boxes[:, 0, :, 3] *= normFactor

    sum_cost = 0
    count = 0

    labels_gt = []
    labels_det = []

    gt_to_detection = {}
    net_ctc.clear_param_diffs()

    batch_buckets = []
    dummy = {}

    matched_detections = 0
    for bid in range(im.shape[0]):  # iterate over every sample in the batch

        o_image = net.layers[0].get_image_file_name(bid)
        o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
        cx = net.layers[0].get_crop(bid, 0)
        cy = net.layers[0].get_crop(bid, 1)
        cmx = net.layers[0].get_crop(bid, 2)
        cmy = net.layers[0].get_crop(bid, 3)
        o_image = o_image[cy:cmy, cx:cmx]

        boxes_count = 0
        for i in range(0, boxes.shape[2]):
            det_word = boxes[bid, 0, i]
            if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
                break
            boxes_count += 1

        x = [i for i in range(boxes_count)]
        #random.shuffle(x)

        bucket_images = {}
        batch_buckets.append(bucket_images)

        word_gto = word_gtob[bid]
        word_gto_txt = word_txt[bid]
        gt_count = 0
        for gt_no in range(word_gto.shape[0]):
            gt = word_gto[gt_no, :]
            gt = gt.reshape(6)
            gtnum = 1000 * bid + gt_no

            if gt[5] == -1:
                #print("ignore gt!")
                continue

            gt_count += 1

            txt = word_gto_txt[gt_no, :]
            gtbox = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                     (gt[2] * normFactor,
                      gt[3] * normFactor), gt[4] * 180 / 3.14)
            gtbox = cv2.boxPoints(gtbox)

            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)

            if rect_gt[0] == 0 or rect_gt[
                    1] == 0 or rect_gt[0] + rect_gt[2] >= image_size[
                        0] or rect_gt[1] + rect_gt[3] >= image_size[1]:
                continue

            if gt[3] * normFactor < 3:
                if args.debug:
                    print('too small gt!')
                continue

            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]
            rect_gt[3] += rect_gt[1]

            for i in range(0, min(100, boxes_count)):
                # fetch the candidate box first, then filter on its angle
                det_word = boxes[bid, 0, x[i], :]

                if (det_word[0] == 0
                        and det_word[1] == 0) or det_word[5] < 0.01:
                    break

                if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
                    continue

                box = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                       det_word[4] * 180 / 3.14)
                box = cv2.boxPoints(box)

                if args.debug:
                    boxp = np.array(box, dtype="int")
                    vis.draw_box_points(draw[bid], boxp, color=(0, 255, 0))

                box = np.array(box, dtype="int")
                bbox = cv2.boundingRect(box)
                bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
                bbox[2] += bbox[0]
                bbox[3] += bbox[1]

                #rectangle intersection ...
                inter = intersect(bbox, rect_gt)
                uni = union(bbox, rect_gt)
                ratio = area(inter) / float(area(uni))

                ratio_gt = area(inter) / float(area(rect_gt))
                if ratio_gt < 0.95:
                    continue

                if ratio < 0.5:
                    continue

                if not gt_to_detection.has_key(gtnum):
                    gt_to_detection[gtnum] = [0, 0, 0]
                tupl = gt_to_detection[gtnum]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = x[i]
                    tupl[2] = ratio_gt

                det_word = boxes[bid, 0, x[i], :]
                box = ([det_word[0],
                        det_word[1]], [det_word[2],
                                       det_word[3]], det_word[4] * 180 / 3.14)

                boxO = get_obox(im_ctc[bid], o_image, box)
                boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]),
                        boxO[2])
                norm2, rot_mat = get_normalized_image(o_image, boxO)
                #norm3, rot_mat = get_normalized_image(im_ctc[bid], ([det_word[0], det_word[1]], [det_word[2] * 1.2, det_word[3] * 1.1], det_word[4] * 180 / 3.14))
                if norm2 is None:
                    continue
                #if norm3 is None:
                #  continue
                #continue
                #cv2.imshow('ts', norm2)
                #cv2.imshow('ts3', norm3)
                #cv2.waitKey(1)
                width_scale = 32.0 / norm2.shape[0]
                width = norm2.shape[1] * width_scale
                best_diff = width
                bestb = 0
                for b in range(0, len(buckets)):
                    if best_diff > abs(width * 1.3 - buckets[b]):
                        best_diff = abs(width * 1.3 - buckets[b])
                        bestb = b

                scaled = cv2.resize(norm2, (buckets[bestb], 32))
                scaled = np.asarray(scaled, dtype=np.float)
                delta = scaled.max() - scaled.min()
                scaled = (scaled) / (delta / 2)
                scaled -= scaled.mean()

                if not bucket_images.has_key(bestb):
                    bucket_images[bestb] = {}
                    bucket_images[bestb]['img'] = []
                    bucket_images[bestb]['sizes'] = []
                    bucket_images[bestb]['txt'] = []
                    bucket_images[bestb]['gt_enc'] = []
                    dummy[bestb] = 1
                else:
                    # cap the number of crops per bucket (the dict itself always has
                    # exactly four keys, so measure the image list instead)
                    if args.debug and len(bucket_images[bestb]['img']) > 4:
                        continue
                    elif len(bucket_images[bestb]['img']) > 32:
                        continue

                gt_labels = []
                txt_enc = ''
                for k in range(txt.shape[1]):
                    if txt[0, k] > 0:
                        if codec_rev.has_key(txt[0, k]):
                            gt_labels.append(codec_rev[txt[0, k]])
                        else:
                            gt_labels.append(3)

                        txt_enc += unichr(txt[0, k])
                    else:
                        gt_labels.append(0)

                if scaled.ndim == 3:
                    scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                if args.debug:
                    cv2.imshow('scaled', scaled)
                bucket_images[bestb]['sizes'].append(len(gt_labels))
                bucket_images[bestb]['gt_enc'].append(gt_labels)
                bucket_images[bestb]['txt'].append(txt_enc)
                bucket_images[bestb]['img'].append(scaled)
                matched_detections += 1

    # and learn OCR
    # NOTE: only the last sample's bucket_images is consumed here;
    # batch_buckets gathers one dict per sample but is never read back.
    for bucket in bucket_images.keys():

        imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float)
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))
        #imtf = imtf.reshape((imtf.shape[0], imtf.shape[1], imtf.shape[2], 1))
        #imtf = np.swapaxes(imtf,1,3)

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        labels = bucket_images[bucket]['gt_enc']
        txt = bucket_images[bucket]['txt']

        max_len = 0
        for l in range(0, len(labels)):
            max_len = max(max_len, len(labels[l]))
        for l in range(0, len(labels)):
            while len(labels[l]) < max_len:
                labels[l].append(0)

        labels = np.asarray(labels, np.float)

        net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])

        net_ctc.blobs['label'].data[...] = labels

        if args.debug:
            vis.vis_square(imtf[0])
            cv2.imshow('draw', draw[0])
            cv2.waitKey(5)

        #optim.step(1)
        sum_cost += net_ctc.blobs['loss'].data[...]
        if net_ctc.blobs['loss'].data[...] > 10:
            #vis.vis_square(imtf[0])
            #cv2.imshow('draw', draw[0])
            sf = net_ctc.blobs['transpose'].data[...]
            labels2 = sf.argmax(3)
            out = utils.print_seq(labels2[:, 0, :])
            print(u'{0} --- {1}'.format(out, txt[0]))
            #cv2.waitKey(5)

        count += imtf.shape[0]

    correct_cout = 0
    for i in range(len(labels_gt)):
        det_text = labels_det[i]
        gt_text = labels_gt[i]

        if it % 100 == 0:
            pass
            #print( u"{0} -- {1}".format(det_text, gt_text).encode('utf8') )
        if det_text == gt_text:
            correct_cout += 1

    count = max(count, 1)
    mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
    mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(
        max(1, len(labels_gt)))

    #count detection ratio

    tp = 0
    for bid in range(im.shape[0]):
        word_gto = word_gtob[bid]
        for gt_no in range(len(word_gto)):
            gt = word_gto[gt_no]
            gtnum = 1000 * bid + gt_no
            if gt_to_detection.has_key(gtnum):
                tupl = gt_to_detection[gtnum]
                if tupl[0] > 0.5:
                    tp += 1

    loc_recall = tp / float(max(1, gt_count))

    if it % 10 == 0:
        print(
            '{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'
            .format(it, 0.0001, sum_cost / count, mean_loss,
                    correct_cout / float(max(1, len(labels_gt))), mean_rec,
                    loc_recall, matched_detections))

    if it % snapshot_interval == 0:
        #optim.snapshot()
        optim2.snapshot()
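
Both training variants (this one and Example #3) score the overlap between a detected box and a ground-truth box through intersect, union and area helpers that are defined elsewhere. A minimal sketch consistent with how they are called, treating boxes as [x1, y1, x2, y2] and union as the bounding rectangle (which makes ratio an IoU-like measure rather than exact IoU):

def intersect(a, b):
    # overlap rectangle of two boxes; degenerate (zero-area) when they do not touch
    return [max(a[0], b[0]), max(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])]

def union(a, b):
    # bounding rectangle enclosing both boxes
    return [min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])]

def area(r):
    # clamped, so an empty intersection contributes zero area
    return max(0, r[2] - r[0]) * max(0, r[3] - r[1])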
Example #3
def process_batch(nets, optim, optim2, image_size, args):
  global it, mean_loss, mean_rec
  
  net, net_ctc = nets
  
  net = net.net
  net_ctc = net_ctc.net
  
  
  net.blobs['data'].reshape(args.batch_size,1,image_size[1],image_size[0])
  net.reshape()
      
  it += 1 
  
  optim2.step(1)
  
  im = net.blobs['data'].data[...]
  draw = np.swapaxes(im,2,3)
  draw = np.swapaxes(draw,1,3)
  im_ctc = np.copy(draw)
  draw += 1
  draw *= 128
  draw = np.array(draw, dtype="uint8").copy() 
  
  
  if args.debug:
    grid_step = 16
    line = 0
    while line < image_size[0]:
      cv2.line(draw[0], (0, line), (image_size[1], line), (128, 128, 128))
      line += grid_step
  
  boxes  =  net.blobs['boxes'].data[...]
                 
  word_gtob = net.blobs['gt_boxes'].data[...]
  word_txt = net.blobs['gt_labels'].data[...]
  
  lines_gtob = net.blobs['line_boxes'].data[...]
  lines_txt = net.blobs['line_labels'].data[...]
  
  #nms = boxeso[:, 0, 0, 8] == 0
  #boxes = boxes[:, :, nms, :]
  
  boxes[:, 0, :, 0] *= image_size[0]
  boxes[:, 0, :, 1] *= image_size[1]
  normFactor = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0])
  boxes[:, 0, :, 2] *= normFactor
  boxes[:, 0, :, 3] *= normFactor
  
  sum_cost = 0
  count = 0
  
  labels_gt = []
  labels_det = []
  
  gt_to_detection = {}
  net_ctc.clear_param_diffs()
  
  
  batch_buckets = []    
  dummy = {} 
  
  matched_detections = 0
  for bid in range(im.shape[0]):
    
    o_image = net.layers[0].get_image_file_name(bid)
    o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
    cx = net.layers[0].get_crop(bid, 0)
    cy = net.layers[0].get_crop(bid, 1)
    cmx = net.layers[0].get_crop(bid, 2)
    cmy = net.layers[0].get_crop(bid, 3)
    o_image = o_image[cy:cmy, cx:cmx]
    
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
      det_word = boxes[bid, 0, i]
      if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
          break
      boxes_count += 1
        
    x = [i for i in range(boxes_count)]
    #random.shuffle(x)
    
    bucket_images = {}
    batch_buckets.append(bucket_images)
    
    word_gto = word_gtob[bid]
    word_gto_txt = word_txt[bid]
    gt_count = 0 
    for gt_no in range(word_gto.shape[0]):
      gt = word_gto[gt_no, :]
      gt = gt.reshape(6)
      gtnum = 1000 * bid +  gt_no
      
      if gt[5] == -1:
        #print("ignore gt!")
        continue
      
      gt_count += 1
                  
      txt = word_gto_txt[gt_no, :]
      gtbox  = ((gt[0] * image_size[0], gt[1] * image_size[1]), (gt[2] * normFactor, gt[3] * normFactor), gt[4] * 180 / 3.14)
      gtbox = cv2.boxPoints(gtbox)
      
      gtbox = np.array(gtbox, dtype="int")
      rect_gt = cv2.boundingRect(gtbox)

      if rect_gt[0] == 0 or rect_gt[1] == 0 or  rect_gt[0] + rect_gt[2]  >= image_size[0] or rect_gt[1] + rect_gt[3]  >= image_size[1]:
        continue
      
      if gt[3] * normFactor <  3:
        if args.debug:
          #print('too small gt!')
          vis.draw_box_points(draw[bid], gtbox, color = (255, 255, 0))
          cv2.imshow('draw', draw[bid])
        continue
        
      if args.debug:
        vis.draw_box_points(draw[bid], gtbox, color = (0, 0, 0), thickness=2)
      
      #vis.draw_box_points(draw[bid], gtbox, color = (255, 255, 255))
      #cv2.imshow('draw', draw[bid])
      
      rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
      rect_gt[2] += rect_gt[0]
      rect_gt[3] += rect_gt[1]

      for i in range(0, min(100, boxes_count)):
        # fetch the candidate box first, then filter on its angle
        det_word = boxes[bid, 0, x[i], :]

        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
          break

        if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
          continue
        
        box  = ((det_word[0], det_word[1]), (det_word[2], det_word[3]), det_word[4] * 180 / 3.14)
        box = cv2.boxPoints(box)
        
        if args.debug:
          boxp = np.array(box, dtype="int")
          vis.draw_box_points(draw[bid], boxp, color = (0, 255, 0))
        
        box = np.array(box, dtype="int")
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]
   
        #rectangle intersection ... 
        inter = intersect(bbox, rect_gt)
        uni = union(bbox, rect_gt)
        ratio = area(inter) / float(area(uni))
        
        ratio_gt = area(inter) / float(area(rect_gt))
        if ratio_gt < 0.95:
          continue 
        
        if ratio < 0.5:
          continue
        
        if not gt_to_detection.has_key(gtnum):
            gt_to_detection[gtnum] = [0, 0, 0]
        tupl = gt_to_detection[gtnum] 
        if tupl[0] < ratio:
          tupl[0] = ratio 
          tupl[1] = x[i]  
          tupl[2] = ratio_gt       
        
        det_word = boxes[bid, 0, x[i], :]
        box  = ([det_word[0], det_word[1]], [det_word[2], det_word[3]], det_word[4] * 180 / 3.14)
        
        boxO = get_obox(im_ctc[bid], o_image, box)
        boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]), boxO[2])
        norm2, rot_mat = get_normalized_image(o_image, boxO)
        #norm3, rot_mat = get_normalized_image(im_ctc[bid], ([det_word[0], det_word[1]], [det_word[2] * 1.2, det_word[3] * 1.1], det_word[4] * 180 / 3.14))
        if norm2 is None:
          continue
        #if norm3 is None:
        #  continue
        #continue
        #cv2.imshow('ts', norm2)
        #cv2.imshow('ts3', norm3)
        #cv2.waitKey(1)
        width_scale = 32.0 / norm2.shape[0]
        width = norm2.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
          if best_diff > abs(width * 1.3 - buckets[b]):
            best_diff = abs(width * 1.3 - buckets[b])
            bestb = b
        
        scaled = cv2.resize(norm2, (buckets[bestb], 32))  
        scaled = np.asarray(scaled, dtype=np.float)
        delta = scaled.max() - scaled.min()
        scaled = (scaled) / (delta / 2)
        scaled -= scaled.mean()
                
        if not bucket_images.has_key(bestb):
          bucket_images[bestb] = {}
          bucket_images[bestb]['img'] = []  
          bucket_images[bestb]['sizes'] = []    
          bucket_images[bestb]['txt'] = []
          bucket_images[bestb]['gt_enc'] = []
          dummy[bestb] = 1
        else:
          # cap the number of crops per bucket by the image list length
          if args.debug and len(bucket_images[bestb]['img']) > 4:
            continue
          elif len(bucket_images[bestb]['img']) > 32:
            continue
        
        gt_labels = []
        txt_enc = ''
        for k in range(txt.shape[1]):
          if txt[0, k] > 0:
            if codec_rev.has_key(txt[0, k]):                
              gt_labels.append( codec_rev[txt[0, k]] )
            else:
              gt_labels.append( 3 )
                              
            txt_enc += unichr(txt[0, k])
          else:
            gt_labels.append( 0 )
        
        if scaled.ndim == 3:
          scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        if args.debug:
          cv2.imshow('scaled', scaled)
        bucket_images[bestb]['sizes'].append(len(gt_labels))
        bucket_images[bestb]['gt_enc'].append(gt_labels)
        bucket_images[bestb]['txt'].append(txt_enc)
        bucket_images[bestb]['img'].append(scaled)
        matched_detections += 1   
      
  # and learn OCR
  # NOTE: as in Example #2, only the last sample's bucket_images is consumed here.
  for bucket in bucket_images.keys():
      
    imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float)
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))    
    #imtf = imtf.reshape((imtf.shape[0], imtf.shape[1], imtf.shape[2], 1))
    #imtf = np.swapaxes(imtf,1,3)
    
    
    net_ctc.blobs['data'].reshape(imtf.shape[0],imtf.shape[1],imtf.shape[2], imtf.shape[3]) 
    net_ctc.blobs['data'].data[...] = imtf
    
    labels = bucket_images[bucket]['gt_enc']
    txt = bucket_images[bucket]['txt']
    
    max_len = 0
    for l in range(0, len(labels)):
      max_len = max(max_len, len(labels[l]))
    for l in range(0, len(labels)):
      while len(labels[l]) <  max_len:
        labels[l].append(0)
      
    
    labels = np.asarray(labels, np.float)
    
    net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])
    
    net_ctc.blobs['label'].data[...] = labels
    
    if args.debug:
        vis.vis_square(imtf[0])
        cv2.imshow('draw', draw[0])
        cv2.waitKey(5)
         
     
    optim.step(1)  
    sum_cost += net_ctc.blobs['loss'].data[...]
    if net_ctc.blobs['loss'].data[...] > 10:
      vis.vis_square(imtf[0])
      cv2.imshow('draw', draw[0])
      sf = net_ctc.blobs['transpose'].data[...]
      labels2 = sf.argmax(3)
      out = utils.print_seq(labels2[:, 0, :])
      print(u'{0} - {1}'.format(out, txt[0])  )
      cv2.waitKey(5)
          
          
    count += imtf.shape[0]
              
  correct_cout = 0    
  for i in range(len(labels_gt)):
    det_text = labels_det[i]
    gt_text = labels_gt[i]
    
    if it % 100 == 0:
      print( u"{0} - {1}".format(det_text, gt_text).encode('utf8') )
    if det_text == gt_text:
      correct_cout += 1
      
  count = max(count, 1)    
  mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
  mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(max(1, len(labels_gt)))
  
  #count detection ratio

  tp = 0
  for bid in range(im.shape[0]):
    word_gto = word_gtob[bid]
    for gt_no in range(len(word_gto)):
      gt = word_gto[gt_no]
      gtnum = 1000 * bid +  gt_no
      if gt_to_detection.has_key(gtnum):
        tupl = gt_to_detection[gtnum] 
        if tupl[0] > 0.5:
          tp += 1
          
                      
  loc_recall = tp / float(max(1, gt_count))             
  if args.debug:
    cv2.imshow('draw', draw[0])
    if im.shape[0] > 1:
        cv2.imshow('draw2', draw[1])
        
    cv2.waitKey(10)
  
  if it % 10 == 0:
    print('{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'.format(it, 0.0001, sum_cost / count, mean_loss, correct_cout / float(max(1, len(labels_gt))), mean_rec, loc_recall, matched_detections))
  
  if it % 1000 == 0:
    optim.snapshot()
    optim2.snapshot()
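
Every variant snaps the rescaled crop width onto a fixed list of admissible widths (the global buckets list, defined elsewhere) so the recognizer FCN only ever sees a handful of input shapes. The nearest-bucket search written out as a standalone helper; the training loops above effectively use stretch=1.3, inference uses 1.0:

def pick_bucket(width, buckets, stretch=1.0):
    # index of the bucket width closest to the (optionally stretched) crop width
    best_diff = float('inf')
    bestb = 0
    for b, bucket_width in enumerate(buckets):
        diff = abs(width * stretch - bucket_width)
        if diff < best_diff:
            best_diff = diff
            bestb = b
    return bestb

# usage mirroring the loops above:
# scaled = cv2.resize(norm2, (buckets[pick_bucket(width, buckets, 1.3)], 32))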
Example #4
def ocr_detections(net_ctc,
                   img,
                   scaled_img,
                   boxes,
                   image_size,
                   r_p_th,
                   out_raw,
                   baseName,
                   debug,
                   split_words,
                   alow_non_dict=False):

    global rec_t, ext_factor, use_per_image

    draw = np.copy(scaled_img)

    # Region layer returns normalized coordinates; convert the generated boxes to the image coordinate system
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms_mask = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms_mask, :]

    # Region layer returns boxes sorted by r_{p}; filter out the boxes with r_{p} below the threshold value
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < r_p_th:
            break
        boxes_count += 1

    detections_out = []

    for i in range(0, boxes_count):

        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14
                )  # Convert the rotation parameter to degrees
        box = cv2.boxPoints(
            boxr)  # Gives the coordinates for 4 points of bounding-box
        box = np.array(box, dtype="int")

        if det_word[3] < 5:
            continue

        if debug:
            try:
                vis.draw_box_points(
                    draw, box,
                    (255, 0, 0))  # Visualize the predicted bounding-boxes
            except:
                pass

        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate

        boxro = [
            [det_word[0], det_word[1]],
            [det_word[2] * ext_factorx,
             det_word[3] * ext_factor], det_word[4] * 180 / 3.14
        ]  # Re-scaling the bounding-box parameters to increase height and width, this helps recognizer
        boxt = get_obox(
            scaled_img, img,
            boxro)  # Rescale the predicted bounding box to original image size
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        norm2, rot_mat = get_normalized_image(
            img, boxt
        )  # norm2 stores normalized cropped region from original image determined by predicted bounding box
        if norm2 is None:
            continue
        #boxt[2] = boxt[2] * 180 / 3.14
        #cv2.imshow('norm2', norm2)
        #cv2.imshow('draw', draw)
        if norm2.ndim > 2:
            norm = cv2.cvtColor(
                norm2, cv2.COLOR_BGR2GRAY
            )  # Convert the cropped region to GRAY scale for recognizer
        else:
            norm = norm2  # Do nothing if already GRAY scale

        # Change width for each cropped region, keeping height fixed (32). Map width to closest value from bucket
        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for idx, val in enumerate(buckets):
            if (val - width) < 0:
                bestb = idx
                best_diff = abs(val - width) * 3
                continue
            if best_diff > (val - width):
                bestb = idx
                best_diff = (val - width)
        scaled = cv2.resize(
            norm, (buckets[bestb],
                   32))  # Resize cropped region for input for recognizer FCN

        if scaled.ndim == 3:
            scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)

        imtf = np.asarray([scaled], dtype=np.float)
        imtf /= 128.0
        imtf -= 1
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(
            imtf.shape[0], imtf.shape[1], imtf.shape[2], imtf.shape[3]
        )  # Reshape the recognizer FCN to adapt varying cropped region size
        net_ctc.blobs['data'].data[
            ...] = imtf  # Load the data onto recognizer FCN (cropped region data)
        net_ctc.forward()  # Recognizer FCN feed-forward
        ctc_f = net_ctc.blobs['softmax'].data[...]

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(
            2
        )  # 3rd dimension (ctc_f[:,:,2]) contains softmax distribution over all the possible characters for each position, thus labels store the index of character with maximum value (probability).
        mask = labels > 3
        masked = ctc_f.max(
            2
        )[mask]  # For each predicted character, fetch the corresponding score
        mean_conf = np.sum(masked) / masked.shape[
            0]  # Mean score for all the predicted characters

        # Skip the detection if the mean score over predicted characters is below 0.3
        if mean_conf < 0.3:
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))
        if not split_words:
            detections_out.extend([(boxt, (det_text, mean_conf, 1, mean_conf))
                                   ])
            continue

        #print(det_text)
        #str_lm, pr =  cmp_trie.decode_sofmax_lm(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        #if det_text != str_lm:
        #  print('  Decoding diff: {0} - {1}'.format(det_text, str_lm))
        #  det_text = str_lm.strip()

        if len(det_text.strip()) == 0:
            continue

        if len(det_text.strip()) <= 3:
            if mean_conf < 0.6 or det_word[5] < 0.4:
                continue

        pr = 1
        for k in range(masked.shape[0]):
            pr = pr * masked[k]
        pr = math.exp(pr)
        #pr = math.pow(pr, 1.0/ len(det_text) )

        #tex_conf =  mean_conf / ctc_f.shape[0]
        #if tex_conf < 0.1:
        #  continue

        #print(det_text)
        #cv2.imshow('norm2', norm2)
        splits_raw = process_splits(
            det_text,
            conf,
            dec_s,
            norm2,
            ctc_f,
            rot_mat,
            boxt,
            img,
            det_word[5],
            mean_conf,
            alow_non_dict=alow_non_dict
        )  # Process the split and improve the localization results using "space" (' ') predicted by recognizer
        detections_out.extend(splits_raw)
        spl = det_text.split(" ")

        if len(spl) == 1 and cmp_trie.is_dict(
                spl[0].lower().encode('utf-8')) == 1:
            continue

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])

        if out_raw is not None and len(det_text) > 2:
            boxout = cv2.boxPoints(boxt)
            out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format(\
                    baseName[:-4],boxout[0, 0],boxout[0, 1], boxout[1, 0], boxout[1, 1], \
                    boxout[2, 0], boxout[2, 1], boxout[3, 0], boxout[3, 1], det_text, best_dict, mean_conf).encode('utf8'))

        splits_out = process_splits(best_dict,
                                    conf2,
                                    dec_splits,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    img,
                                    det_word[5],
                                    pr,
                                    alow_non_dict=False)
        detections_out.extend(splits_out)

    #detections_out = nms(detections_out)
    if out_raw is not None:
        out_raw.flush()

    cv2.imshow('draw', draw)
    cv2.waitKey(10)
    return detections_out
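
print_seq_ext is defined elsewhere; judging from its call sites (per-timestep argmax labels, the first few indices reserved for blank/control characters), it amounts to a greedy CTC collapse. A sketch of that idea, where codec is an assumed index-to-character mapping and the blank threshold matches the labels > 2 / labels > 3 masks above:

def greedy_ctc_decode(labels, codec, blank_threshold=3):
    # collapse repeated labels and drop blank/control indices, CTC-style
    out = []
    prev = -1
    for l in labels:
        if l != prev and l > blank_threshold:
            out.append(codec[l])
        prev = l
    return ''.join(out)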
Example #5
def froward_image(nets, scaled, original):
    '''
    :param nets: YOLO detection net and CTC recognition net
    :param scaled: rescaled grayscale image
    :param original: original image
    :return:
      detections_out: [( ((1181.9506549451335, 174.54442087680732), (116.45833333333334, 19.8), -2.3903521532498173), (u'FORQUEuEING', 0.885055888782848, True, 0)), ...]
        each entry is ((center x, y), (width, height), rotation angle) plus the recognition tuple
      fps: frames processed per second
    '''
    global rec_t, ext_factor, ext_factorx

    net, net_ctc = nets

    img = [scaled]

    # draw = img[0]
    # imgo = original

    im = np.asarray(img, dtype=np.float)
    im = im / 128.0
    im = im - 1.0
    #im = im.reshape((3, im.shape[0], im.shape[1]))
    im = np.swapaxes(im, 1, 3)
    im = np.swapaxes(im, 2, 3)

    net.blobs['data'].reshape(im.shape[0], im.shape[1], im.shape[2],
                              im.shape[3])
    net.blobs['data'].data[...] = im
    net.reshape()
    start = time.time()
    out = net.forward(start="conv1")
    end = time.time()
    seconds = end - start
    fps = 1 / seconds

    boxes = out['boxes']  # shape (1, 1, 500, 15): 500 anchors

    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms, :]

    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.1:
            break
        boxes_count += 1

    detections_out = []
    # run recognition once on every detected box (after NMS, with score > 0.1)
    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)  # the predicted x, y, w, h and angle
        box = cv2.boxPoints(boxr)  # the four corner coordinates

        box = np.array(box, dtype="int")
        #vis.draw_box_points(draw, box, (255, 0, 0))
        bbox = cv2.boundingRect(box)  # axis-aligned bounding rect: x, y, w, h
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]  # note: bbox is never used after this point

        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(img[0], original, boxro)
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        norm2, rot_mat = get_normalized_image(original, boxt)
        if norm2 is None:
            continue

        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)

        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
            if best_diff > abs(width - buckets[b]):
                best_diff = abs(width - buckets[b])
                bestb = b

        scaled = cv2.resize(norm, (buckets[bestb], 32))

        imtf = np.asarray([scaled], dtype=np.float)
        delta = imtf.max() - imtf.min()
        imtf /= (delta / 2)
        imtf -= imtf.mean()
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        outctc = net_ctc.forward()  # ['loss', 'softmax']
        ctc_f = outctc['softmax']  # shape (48, 1, 1, 141)

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(2)  #(48, 1)

        mask = labels > 2
        masked = ctc_f.max(2)[mask]
        mean_conf = np.sum(masked) / masked.shape[0]

        if mean_conf < 0.2:
            vis.draw_box_points(scaled, box, color=(0, 0, 0))
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(
            labels[:, 0], np.sum(masked))  # det_text is the recognized string

        if len(det_text) == 0:
            continue
        if len(det_text) < 3 and mean_conf < 0.8:
            continue

        splits_raw = process_splits(det_text,
                                    conf,
                                    dec_s,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    original,
                                    0,
                                    mean_conf,
                                    alow_non_dict=True)
        detections_out.extend(splits_raw)

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])  # dictionary-constrained decoding; here it returned "" so the loop continues below

        if len(best_dict) == 0:
            continue
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f,
                                    rot_mat, boxt, original, 1, mean_conf)
        detections_out.extend(splits_out)

    return detections_out, fps
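
The detector emits rotated boxes as ((cx, cy), (w, h), angle in degrees), which is exactly the rotated-rect format cv2.boxPoints expects; each example converts a box to its four corner points before drawing or cropping. The conversion in isolation:

import cv2
import numpy as np

rot_rect = ((100.0, 50.0), (80.0, 20.0), -15.0)  # center, (width, height), angle in degrees
corners = cv2.boxPoints(rot_rect)                # 4x2 array of float corner coordinates
corners = np.array(corners, dtype="int")         # integer pixels, as the examples do before drawing
print(corners)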
Example #6
def ocr_detections(net_ctc, img, scaled_img, boxes, image_size, r_p_th, out_raw, baseName, debug, split_words, alow_non_dict=False):
    
  global rec_t, ext_factor, use_per_image
    
  draw = np.copy(scaled_img)
    
  # Region layer returns normalized coordinates; convert the generated boxes to the image coordinate system
  boxes[0, 0, :, 0] *= image_size[0]
  boxes[0, 0, :, 1] *= image_size[1]
  normFactor = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0])
  boxes[0, 0, :, 2] *= normFactor
  boxes[0, 0, :, 3] *= normFactor
  
  nms_mask = boxes[0, 0, :, 8] != 1
  boxes = boxes[:, :, nms_mask, :]
  
  # Region layer returns boxes sorted by r_{p}; filter out the boxes with r_{p} below the threshold value
  boxes_count = 0
  for i in range(0, boxes.shape[2]):
      det_word = boxes[0, 0, i]
      if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < r_p_th:
        break
      boxes_count += 1
  
  detections_out = []
  
  for i in range(0, boxes_count):
      
    det_word = boxes[0, 0, i]
    boxr  = ((det_word[0], det_word[1]), (det_word[2], det_word[3]), det_word[4] * 180 / 3.14) # Convert the rotation parameter to degrees
    box = cv2.boxPoints(boxr) # Gives the coordinates for 4 points of bounding-box
    box = np.array(box, dtype="int")
    
    if det_word[3] < 5:
      continue
    
    if debug:
      try:
        vis.draw_box_points(draw, box, (255, 0, 0)) # Visualize the predicted bounding-boxes
      except:
        pass
    
    bbox = cv2.boundingRect(box)
    bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
    bbox[2] += bbox[0] # Convert width to right-coordinate
    bbox[3] += bbox[1] # Convert height to bottom-coordinate
    
    boxro  = [[det_word[0], det_word[1]], [det_word[2]  * ext_factorx, det_word[3] * ext_factor], det_word[4] * 180 / 3.14] # Re-scaling the bounding-box parameters to increase height and width, this helps recognizer
    boxt = get_obox(scaled_img, img, boxro) # Rescale the predicted bounding box to original image size
    boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])
    
    norm2, rot_mat = get_normalized_image(img, boxt) # norm2 stores normalized cropped region from original image determined by predicted bounding box
    if norm2 is None:
      continue
    #boxt[2] = boxt[2] * 180 / 3.14
    #cv2.imshow('norm2', norm2)
    #cv2.imshow('draw', draw)
    if norm2.ndim > 2:
        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY ) # Convert the cropped region to GRAY scale for recognizer
    else:
        norm = norm2 # Do nothing if already GRAY scale                             
    
    # Change width for each cropped region, keeping height fixed (32). Map width to closest value from bucket
    width_scale = 32.0 / norm2.shape[0]
    width = norm.shape[1] * width_scale
    best_diff = width
    bestb = 0
    for idx, val in enumerate(buckets):
      if (val - width) < 0:
          bestb = idx
          best_diff = abs(val - width) * 3
          continue
      if best_diff > (val - width):
          bestb = idx
          best_diff = (val - width)
    scaled = cv2.resize(norm, (buckets[bestb], 32)) # Resize cropped region for input for recognizer FCN
         
    if scaled.ndim == 3:
      scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY) 
    
    imtf = np.asarray([scaled], dtype=np.float)
    imtf /= 128.0
    imtf -= 1
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2])) 
        
    net_ctc.blobs['data'].reshape(imtf.shape[0],imtf.shape[1],imtf.shape[2], imtf.shape[3]) # Reshape the recognizer FCN to adapt varying cropped region size
    net_ctc.blobs['data'].data[...] = imtf # Load the data onto recognizer FCN (cropped region data)
    net_ctc.forward() # Recognizer FCN feed-forward
    ctc_f = net_ctc.blobs['softmax'].data[...] 
    
    ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
    labels = ctc_f.argmax(2) # 3rd dimension (ctc_f[:,:,2]) contains softmax distribution over all the possible characters for each position, thus labels store the index of character with maximum value (probability).
    mask = labels > 3
    masked = ctc_f.max(2)[mask] # For each predicted character, fetch the corresponding score
    mean_conf = np.sum(masked) / masked.shape[0] # Mean score for all the predicted characters
    
    # Skip the detection if the mean score over predicted characters is below 0.3
    if mean_conf < 0.3:
      continue
    
    if debug:    
      vis.vis_square(imtf[0])
    
    det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked) ) 
    if not split_words:
      detections_out.extend( [(boxt, (det_text, mean_conf, 1, mean_conf) )] )
      continue
    
    #print(det_text)
    #str_lm, pr =  cmp_trie.decode_sofmax_lm(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
    #if det_text != str_lm:
    #  print('  Decoding diff: {0} - {1}'.format(det_text, str_lm))
    #  det_text = str_lm.strip()
    
    if len(det_text.strip()) == 0:
      continue
    
    if len(det_text.strip()) <= 3:
      if mean_conf < 0.6 or det_word[5] < 0.4:
        continue
    
    pr = 1
    for k in range(masked.shape[0]):
      pr = pr *  masked[k]
    pr = math.exp(pr)
    #pr = math.pow(pr, 1.0/ len(det_text) )
    
    #tex_conf =  mean_conf / ctc_f.shape[0]
    #if tex_conf < 0.1:
    #  continue
    
    #print(det_text)
    #cv2.imshow('norm2', norm2)
    splits_raw = process_splits(det_text, conf, dec_s, norm2, ctc_f, rot_mat, boxt, img, det_word[5], mean_conf, alow_non_dict = alow_non_dict) # Process the split and improve the localization results using "space" (' ') predicted by recognizer
    detections_out.extend( splits_raw )
    spl = det_text.split(" ")
    
    if len(spl) == 1 and cmp_trie.is_dict(spl[0].lower().encode('utf-8')) == 1:
      continue
                  
    
    dec2, conf2, dec_splits = cmp_trie.decode_sofmax(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
    best_dict = print_seq2(dec2[0])
    
    if out_raw is not None and len(det_text) > 2:
      boxout = cv2.boxPoints(boxt)    
      out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format(\
              baseName[:-4],boxout[0, 0],boxout[0, 1], boxout[1, 0], boxout[1, 1], \
              boxout[2, 0], boxout[2, 1], boxout[3, 0], boxout[3, 1], det_text, best_dict, mean_conf).encode('utf8'))
  
    splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f, rot_mat, boxt, img, det_word[5], pr, alow_non_dict=False)
    detections_out.extend( splits_out )
  
  #detections_out = nms(detections_out)
  if out_raw is not None:
    out_raw.flush()   
  
  cv2.imshow('draw', draw)
  cv2.waitKey(10)  
  return detections_out
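
ocr_detections streams one pipe-separated record per kept detection to out_raw: base name, the eight corner coordinates, the raw and dictionary-decoded texts, and the mean confidence. A small reader for that format; the function name and field unpacking here are illustrative assumptions:

def read_raw_detections(path):
    records = []
    with open(path) as f:
        for line in f:
            fields = line.rstrip('\n').split('|')
            if len(fields) != 12:
                continue  # skip malformed rows
            name = fields[0]
            corners = [float(v) for v in fields[1:9]]
            raw_text, dict_text = fields[9], fields[10]
            records.append((name, corners, raw_text, dict_text, float(fields[11])))
    return records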