Beispiel #1
0
def extract(image):
    """Segment a page image into text lines and write them out as PNGs.

    The image is read as a binary page, inverted, segmented with
    ``compute_segmentation``, and the detected lines are put into reading
    order.  The renumbered page segmentation and one binary image per line
    are then written below ``outputdir``.

    Any exception raised along the way is swallowed and reported only as the
    word ``error`` on stdout (best-effort behaviour of the original code).

    :param image: passed straight to ``ocrolib.read_image_binary``.
    :return: None
    """
    try:
        binary = ocrolib.read_image_binary(image)
        binary = 1 - binary

        scale = psegutils.estimate_scale(binary)
        segmentation = compute_segmentation(binary, scale)

        # compute the reading order
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        # (the label count comes from the segmentation array, not from the
        # compute_segmentation function object)
        nlabels = amax(segmentation) + 1
        renumber = zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        # NOTE(review): this is a URL, not a filesystem path -- confirm that
        # the ocrolib writers below can actually write to it.
        outputdir = "http://127.0.0.1:5000/uploads/"

        lines = [lines[i] for i in lsort]
        ocrolib.write_page_segmentation("%s.pseg.png" % outputdir,
                                        segmentation)

        cleaned = ocrolib.remove_noise(binary, args.noise)
        for i, l in enumerate(lines):
            binline = psegutils.extract_masked(1 - cleaned, l,
                                               pad=args.pad,
                                               expand=args.expand)
            ocrolib.write_image_binary(
                "%s/01%04x.bin.png" % (outputdir, i + 1), binline)
    except Exception:
        # Best-effort: report failure without propagating, but let
        # KeyboardInterrupt / SystemExit through.
        print ('error')
Beispiel #2
0
def caption_segment(binary):
    '''
    Find candidate text regions in a binarized "caption bar" image.

    :param binary: binarized caption-bar image to analyse
    :return: list of ``[start, stop]`` pairs for the accepted text regions
             (empty when ``compute_index`` yields two candidates or fewer).
             Presumably these are positions along one image axis -- confirm
             against ``compute_index``.
    '''
    # Remove small noise components (e.g. clutter near the border).
    bina = ocrolib.remove_noise(binary, 8)
    scale = psegutils.estimate_scale(bina)
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines
    bina = morph.select_regions(bina, sl.dim0, min=scale / 3)
    # Grow the text regions so that neighbouring text gets connected.
    textlines = filters.maximum_filter(bina, (scale, scale / 2))
    # Start/stop positions of the candidate text regions.
    indexs_white = compute_index(textlines, th=scale / 2, n=1)
    indexs_lists = []
    if len(indexs_white) > 2:
        for index in indexs_white:
            # Drop candidates that are too small.
            if index[1] - index[0] <= scale / 2:
                continue
            # Merge with the previously *accepted* region when the gap is
            # small.  Only accepted regions are considered, so a rejected
            # first candidate can no longer trigger a list.remove() of an
            # element that was never appended (ValueError).
            if indexs_lists and index[0] - indexs_lists[-1][1] < scale / 3:
                indexs_lists[-1] = [indexs_lists[-1][0], index[1]]
            else:
                indexs_lists.append(index)
    return indexs_lists
Beispiel #3
0
    def calc(self, objects, scale):
        """Split ``self.binpage`` into text lines and OCR each one twice.

        Does nothing when ``self.binpage`` is None.  Otherwise the page is
        segmented into lines, ``self.binpage`` is replaced by its
        noise-filtered version, and ``self.lines`` is rebuilt as a list of
        ``psegutils.record`` objects carrying the line bounds, the Tesseract
        output for ``--oem 0`` (``text1``) and ``--oem 1`` (``text2``), and
        the normalized line image.  Both predictions are also printed.

        :param objects: not used by this method.
        :param scale: scale value handed to the segmentation helpers.
        :return: None
        """
        if self.binpage is None:
            return

        bottom, top, boxmap = compute_gradmaps(self.binpage, scale)
        seeds0 = compute_line_seeds(self.binpage, bottom, top, scale)
        seeds, _ = morph.label(seeds0)

        llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
        spread = spread_labels(seeds, maxdist=scale)
        llabels = where(llabels > 0, llabels, spread * self.binpage)
        segmentation = llabels * self.binpage
        # The segmentation above uses the unfiltered page; noise is removed
        # only afterwards, before the line images are cut out.
        self.binpage = ocrolib.remove_noise(self.binpage, args.noise)
        lines = psegutils.compute_lines(segmentation, scale)
        binpage_reversed = 1 - self.binpage

        self.lines = []
        for l in lines:
            binline = psegutils.extract_masked(
                binpage_reversed, l, pad=args.pad,
                expand=args.expand)  # black text
            binline = (1 - binline)
            le = lineest.CenterNormalizer(binline.shape[0])  # white text
            binline = binline.astype(float)
            le.measure(binline)
            binline = le.normalize(binline)
            binline = where(binline > 0.5, 0, 1)  # black text

            # Single-argument print calls produce the same output on
            # Python 2 and Python 3.
            print('-----------------------')
            pilimg = Image.fromarray((binline * 255).astype(uint8))
            pred_legacy = pytesseract.image_to_string(pilimg,
                                                      lang='eng',
                                                      config='--oem 0 --psm 7')
            print('00 %s' % (pred_legacy,))
            pred_lstm = pytesseract.image_to_string(pilimg,
                                                    lang='eng',
                                                    config='--oem 1 --psm 7')
            print('11 %s' % (pred_lstm,))
            result = psegutils.record(bounds=l.bounds,
                                      text1=pred_legacy,
                                      text2=pred_lstm,
                                      img=binline)
            self.lines.append(result)
Beispiel #4
0
    def printResult(self, outputfile):
        """OCR the card patch field by field and write the result as JSON.

        The patch is sharpened, binarized with ``sauvola`` and masked with
        ``self.patch_mask``; text lines are then extracted, normalized and
        sent to ``ocr`` with a configuration chosen from the vertical
        position of the line (looked up in ``self.linepos1``) and, for two
        fields, from whether the line starts in the left half of the
        template.  Recognized text is accumulated per field, cleaned up,
        and the resulting dict is written to ``outputfile`` via
        ``json.dumps``.

        :param outputfile: object with a ``write`` method receiving the JSON.
        :return: None
        """
        # Binarization parameter depends on the card type.
        k = 0.45 if self.name == 'CMND cu - 9 so' else 0.33
        sharpened = sharpen(self.patch)
        binary = sauvola(sharpened,
                         w=int(self.template.shape[1] / 24.5 * 2),
                         k=k,
                         scaledown=0.5,
                         reverse=True)
        binary = cv2.bitwise_and(binary, binary, mask=self.patch_mask)
        scale = self.scale

        # Line extraction copied from Ocropus source code
        bottom, top, boxmap = compute_gradmaps(binary, scale)
        seeds0 = compute_line_seeds(binary, bottom, top, scale)
        seeds, _ = morph.label(seeds0)

        propagated = morph.propagate_labels(boxmap, seeds, conflict=0)
        spread = spread_labels(seeds, maxdist=scale)
        propagated = where(propagated > 0, propagated, spread * binary)
        segmentation = propagated * binary
        denoised = ocrolib.remove_noise(binary, 8)
        lines = psegutils.compute_lines(segmentation, scale / 2)
        binpage_reversed = 1 - denoised

        self.lines = []
        readrs = dict.fromkeys(self.linepos1.keys(), u'')

        # Ordered rules: (field, required side, Tesseract config).  The side
        # is True for "left half only", False for "right half only" and None
        # for "either".  The first matching rule wins.
        vie_config = '--oem 1 --psm 7 -l vie'
        field_rules = (
            ('idNumber', None,
             '--oem 0 --psm 7 -c tessedit_char_whitelist=0123456789'),
            ('dateOfBirth', None,
             '--oem 1 --psm 7 -c tessedit_char_whitelist=0123456789-/'),
            ('Gender', True, vie_config),
            ('Dantoc', False, vie_config),
            ('NguyenQuan', None, vie_config),
            ('fullName', None, vie_config),
        )

        # Process the lines ordered by the start of their second bound.
        for l in sorted(lines, key=lambda x: x.bounds[1].start):
            # Line extraction copied from Ocropus source code
            binline = psegutils.extract_masked(binpage_reversed,
                                               l,
                                               pad=int(scale / 2),
                                               expand=0)  # black text
            binline = (1 - binline)
            normalizer = lineest.CenterNormalizer(
                binline.shape[0])  # white text
            binline = binline.astype(float)
            normalizer.measure(binline)
            binline = normalizer.normalize(binline)
            binline = cv2.resize(binline, None, fx=2.0, fy=2.0)
            binline = where(binline > 0.5, uint8(0),
                            uint8(255))  # black text

            pos = l.bounds[0].stop
            left = (l.bounds[1].start < self.template.shape[1] / 2)

            # Prediction using Tesseract 4.0
            for field, side, config in field_rules:
                if side is True and not left:
                    continue
                if side is False and left:
                    continue
                span = self.linepos1[field]
                if pos > span[0] and pos < span[1]:
                    pred = ocr(binline, config=config)
                    readrs[field] += pred + u' '
                    break

        # Normalize a few look-alike characters; empty fields become None.
        for field in readrs:
            text = readrs[field].replace(u'²', u'2')
            text = text.replace(u'º', u'o')
            text = text.replace(u'»', u'-')
            text = text.strip()
            readrs[field] = text if len(text) != 0 else None

        type_labels = {
            'CMND moi - 12 so': 'CMND Mới - 12 Số',
            'Can Cuoc Cong Dan': 'Căn Cước Công Dân',
            'CMND cu - 9 so': 'CMND Cũ - 9 Số',
        }
        if self.name in type_labels:
            readrs['type'] = type_labels[self.name]

        readrs['NgayHetHan'] = None

        outputfile.write(json.dumps(readrs))
    def _process_segment(self, page_image, page, textregion, region_xywh,
                         page_id, input_file, n):
        """Detect text lines in one region image and attach them to the region.

        Existing TextLines are removed when ``self.parameter['overwrite']``
        is set; otherwise the region is left untouched.  The region image is
        converted to an inverted 8-bit array, segmented with
        ``self.compute_segmentation``, and every line (in reading order) is
        saved as an image through the workspace and added to ``textregion``
        as a ``TextLineType`` with an ``AlternativeImageType``.

        The method returns early (None) when lines are kept, when the scale
        is NaN, above 1000 or below ``minscale``, or when the largest
        segmentation label exceeds ``maxlines``.

        :param page_image: image of the region (converted with
            ``ocrolib.pil2array``).
        :param page: not used by this method.
        :param textregion: region object receiving the detected lines.
        :param region_xywh: passed to ``coordinates_for_segment``; its
            ``'features'`` entry becomes the image comment.
        :param page_id: used in log messages, line ids and when saving.
        :param input_file: used to derive the output file id.
        :param n: index embedded in the saved image file id.
        :return: None
        """
        LOG = getLogger('OcrdAnybaseocrTextline')

        # check for existing text lines and whether to overwrite them
        if textregion.get_TextLine():
            if not self.parameter['overwrite']:
                LOG.warning('keeping existing TextLines in region "%s"',
                            page_id)
                return
            LOG.info('removing existing TextLines in region "%s"', page_id)
            textregion.set_TextLine([])

        binary = ocrolib.pil2array(page_image)
        if len(binary.shape) > 2:
            # Collapse the third axis by averaging.
            binary = np.mean(binary, 2)
        binary = np.array(1 - binary / np.amax(binary), 'B')

        # A configured scale of 0 means "estimate it from the image".
        scale = self.parameter['scale']
        if scale == 0:
            scale = psegutils.estimate_scale(binary)

        scale_is_bad = (np.isnan(scale) or scale > 1000.0
                        or scale < self.parameter['minscale'])
        if scale_is_bad:
            LOG.warning(str(scale) + ": bad scale; skipping!\n")
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            LOG.warning("too many lines %i; skipping!\n",
                        (np.amax(segmentation)))
            return

        lines = psegutils.compute_lines(segmentation, scale)
        reading_order = psegutils.reading_order(
            [line.bounds for line in lines])
        sorted_ids = psegutils.topsort(reading_order)

        # renumber the labels so that they conform to the specs
        label_map = np.zeros(np.amax(segmentation) + 1, 'i')
        for rank, line_idx in enumerate(sorted_ids):
            label_map[lines[line_idx].label] = 0x010000 + (rank + 1)
        segmentation = label_map[segmentation]

        lines = [lines[line_idx] for line_idx in sorted_ids]
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])

        for i, l in enumerate(lines):
            first, second = l.bounds[0], l.bounds[1]

            # NOTE(review): bounds[0] is also used as the first (row) index
            # into `cleaned` below, yet its start/stop are emitted as the
            # first coordinate of every polygon point -- confirm that this
            # is the order coordinates_for_segment expects.
            corners = [[first.start, second.start],
                       [first.stop, second.start],
                       [first.stop, second.stop],
                       [first.start, second.stop]]
            corners = coordinates_for_segment(corners, page_image,
                                              region_xywh)
            line_points = points_from_polygon(corners)

            # Threshold the crop at its mid-range value, then invert.
            crop = cleaned[first, second]
            crop = np.array(255 * (crop > ocrolib.midrange(crop)), 'B')
            crop = 255 - crop
            line_image = ocrolib.array2pil(crop)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                line_image,
                "_".join((file_id, str(n), str(i))),
                page_id=page_id,
                file_grp=self.output_file_grp)
            alt_image = AlternativeImageType(
                filename=file_path, comments=region_xywh['features'])

            text_line = TextLineType(
                custom='readingOrder {index:%d;}' % i,
                id='%s_line%04d' % (page_id, i),
                Coords=CoordsType(line_points))
            text_line.add_AlternativeImage(alt_image)
            textregion.add_TextLine(text_line)
Beispiel #6
0
def analyze_page_layout(binary, gray, rgb=None):
    """Segment a binarized page into text lines sorted in reading order.

    The page is inverted, its scale estimated, and ``compute_segmentation``
    is run with the fixed parameters below; line labels are then renumbered
    by reading order.  The visualisation and file-output sections are
    switched off (``if False``) and there is no return statement, so as
    written the function returns None.

    :param binary: binarized page image; inverted with ``1 - binary``.
    :param gray: grayscale page, only used by the disabled output section.
    :param rgb: optional colour image, only used by the disabled
        visualisation section.
    """
    # Fixed segmentation parameters.
    hscale = 1.0  # Non-standard scaling of horizontal parameters.
    vscale = 1.0  # Non-standard scaling of vertical parameters.
    threshold = 0.2  # Baseline threshold.
    usegauss = True  # Use gaussian instead of uniform.
    maxseps = 0  # Maximum black column separators.
    sepwiden = 10  # Widen black separators (to account for warping).
    blackseps = True
    maxcolseps = 3  # Maximum number of whitespace column separators.
    csminheight = 10  # Minimum column height (units=scale).
    # Parameters of the (disabled) output section.
    noise = 8  # Noise threshold for removing small components from lines.
    gray_output = True  # Also write lines cut from the grayscale page.
    pad = 3  # Padding for extracted lines.
    expand = 3  # Expand mask for grayscale extraction.

    # Disabled debug switch: load test images instead of using the arguments.
    if False:
        binary = ocrolib.read_image_binary('./ocropy_test.bin.png')
        gray = ocrolib.read_image_gray('./ocropy_test.nrm.png')

    binary = 1 - binary  # Invert.

    scale = psegutils.estimate_scale(binary)
    segmentation = compute_segmentation(
        binary, scale, blackseps, maxseps, maxcolseps, csminheight,
        sepwiden, usegauss, vscale, hscale, threshold, quiet=True)

    lines = psegutils.compute_lines(segmentation, scale)
    reading_order = psegutils.reading_order([line.bounds for line in lines])
    sorted_ids = psegutils.topsort(reading_order)

    # Renumber the labels so that they conform to the specs.
    label_map = np.zeros(np.amax(segmentation) + 1, 'i')
    for rank, line_idx in enumerate(sorted_ids):
        label_map[lines[line_idx].label] = 0x010000 + (rank + 1)
    segmentation = label_map[segmentation]  # Image.

    lines = [lines[line_idx] for line_idx in sorted_ids]

    # Disabled: draw the line bounding boxes onto `rgb` and display them.
    if False:
        if rgb is not None:
            # REF [function] >> extract_masked() in ${OCROPY_HOME}/ocrolib/psegutils.py.
            for line in lines:
                y0 = int(line.bounds[0].start)
                x0 = int(line.bounds[1].start)
                y1 = int(line.bounds[0].stop)
                x1 = int(line.bounds[1].stop)
                cv2.rectangle(rgb, (x0, y0), (x1, y1), (0, 0, 255), 1,
                              cv2.LINE_AA)
            cv2.imshow('Image', rgb)
            cv2.waitKey(0)

    # Disabled: write the segmentation and the individual line images.
    # NOTE(review): `outputdir` is not defined inside this function; it must
    # exist in an enclosing scope before this section can be enabled.
    if False:
        if not os.path.exists(outputdir):
            os.mkdir(outputdir)

        ocrolib.write_page_segmentation("%s.pseg.png" % outputdir,
                                        segmentation)
        cleaned = ocrolib.remove_noise(binary, noise)
        for idx, line in enumerate(lines):
            binline = psegutils.extract_masked(
                1 - cleaned, line, pad=pad, expand=expand)  # Image.
            ocrolib.write_image_binary(
                "%s/01%04x.bin.png" % (outputdir, idx + 1), binline)
            if gray_output:
                grayline = psegutils.extract_masked(
                    gray, line, pad=pad, expand=expand)  # Image.
                ocrolib.write_image_gray(
                    "%s/01%04x.nrm.png" % (outputdir, idx + 1), grayline)
Beispiel #7
0
def processPngFile(outRoot, origFile, fileNum):
    """Binarize one page image, segment it into lines and draw the line boxes.

    The input file is copied into ``<outRoot>/<name>.<fileNum>/``, binarized
    with ``binarize``, segmented with ``compute_segmentation``, and the page
    segmentation plus one binary image per line are written next to the
    copy.  The segmentation file is then read back and every region's
    bounding box is drawn onto the original image, which is saved twice:
    as ``<base>.out.png`` and under a sibling ``<outRoot dir name>.out``
    directory.

    Relies on names defined outside this function: ``maxlines``, ``noise``,
    ``pad``, ``expand``, ``binarize``, ``desc``, ``checktype``,
    ``check_page``, ``compute_segmentation`` and ``ABINARY2``.

    :param outRoot: root directory for the per-file working directory.
    :param origFile: path of the image to process.
    :param fileNum: number embedded (zero-padded to 3 digits) in the
        working-directory name.
    :return: True on success; False when binarization fails, the page check
        rejects the image, the estimated scale is NaN or above 1000, or the
        largest segmentation label exceeds ``maxlines``.
    """
    baseName = os.path.basename(origFile)
    baseBase, _ = os.path.splitext(baseName)
    outDir = os.path.join(outRoot, "%s.%03d" % (baseBase, fileNum))
    inFile = os.path.join(outDir, baseName)

    os.makedirs(outDir, exist_ok=True)
    shutil.copy(origFile, inFile)

    inBase, _ = ocrolib.allsplitext(inFile)
    print("**  inBase=%s" % inBase)

    fname = inFile
    outputdir = inBase
    binFile = inBase + ".bin.png"
    outFile = inBase + ".out.png"
    outRoot2, outDir2 = os.path.split(outRoot)
    outFile2 = os.path.join(outRoot2, "%s.out" % outDir2, baseName)
    print("outFile2=%s" % outFile2)
    grayFile = inBase + ".nrm.png"
    psegFile = inBase + ".pseg.png"
    print("  inFile=%s" % inFile)
    print(" binFile=%s" % binFile)
    print("grayFile=%s" % grayFile)
    print(" outFile=%s" % outFile)
    # Sanity checks: never overwrite the input or the binarized file.
    assert inFile and binFile
    assert outFile != inFile
    assert outFile != binFile

    if not binarize(inFile, binFile, grayFile):
        binExists = os.path.exists(binFile)
        print("Couldn't binarize inFile=%s binFile=%s exists=%s" %
              (inFile, binFile, binExists))
        return False

    binary = ocrolib.read_image_binary(binFile)
    print("$$ %s=%s" % (binFile, desc(binary)))
    checktype(binary, ABINARY2)
    check = check_page(np.amax(binary) - binary)
    if check is not None:
        print("%s SKIPPED %s (use -n to disable this check)" % (inFile, check))
        return False

    binary = 1 - binary  # invert

    scale = psegutils.estimate_scale(binary)
    print("scale %f" % scale)
    if np.isnan(scale) or scale > 1000.0:
        print("%s: bad scale (%g); skipping\n" % (fname, scale))
        return False

    # find columns and text lines
    print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if np.amax(segmentation) > maxlines:
        print("%s: too many lines %g" % (fname, np.amax(segmentation)))
        return False

    print("segmentation=%s" % desc(segmentation))
    print("number of lines %g" % np.amax(segmentation))

    # compute the reading order
    print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    print("$$ lsort = %d = %s...%s" % (len(lsort), lsort[:10], lsort[-10:]))

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    print("writing lines")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(outputdir, exist_ok=True)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned,
                                           l,
                                           pad=pad,
                                           expand=expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
    # Report the index of the last line without depending on the loop
    # variable, which would be unbound when no lines were found.
    print("%6d  %s %4.1f %d" % (len(lines) - 1, fname, scale, len(lines)))

    # to proceed, we need a pseg file and a subdirectory containing text lines
    assert os.path.exists(psegFile), "%s: no such file" % psegFile
    assert os.path.isdir(inBase), "%s: no such directory" % inBase

    # iterate through the text lines in reading order, based on the page segmentation file
    pseg = ocrolib.read_page_segmentation(psegFile)
    print("$$ %s=%s" % (psegFile, desc(pseg)))

    regions = ocrolib.RegionExtractor()
    print("$$ regions=%s" % regions)
    regions.setPageLines(pseg)

    im = Image.open(inFile)
    print("~~%s %s" % (inFile, im.size))
    print("$$ regions=%s=%s" % (regions, sorted(regions.__dict__)))
    print("$$ regions.length=%s" % regions.length())

    # Outline every region (indices 1 .. length-1) on the original image;
    # one drawing context is enough for all rectangles.
    draw = ImageDraw.Draw(im)
    for i in range(1, regions.length()):
        y0, x0, y1, x1 = regions.bbox(i)
        draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=3)
        draw.rectangle((x0, y0, x1, y1), outline=(0, 0, 255), width=0)
    del draw

    # write output files
    print("outFile=%s" % outFile)
    im.save(outFile, "PNG")
    print("outFile2=%s" % outFile2)
    outDir2 = os.path.dirname(outFile2)
    os.makedirs(outDir2, exist_ok=True)
    im.save(outFile2, "PNG")
    assert os.path.exists(outFile2)
    return True
Beispiel #8
0
def Segment(fname, save_path):
    """Segment a form image into text-line images stored under ``save_path``.

    ``save_path`` is wiped and recreated, the image is read and binarized,
    split into blocks, and each block is cut into text lines whose records
    are collected in a dictionary that is finally written out with
    ``save_img_from_dic``.

    :param fname: path of the image to segment.
    :param save_path: output directory; any existing content is deleted.
    :return: 1 on success; 0 when binarization failed, in which case the
        input file is copied unchanged into ``save_path``.
    """
    # Clear the cached results of the previous run.
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    os.makedirs(save_path)

    # Read the image data.
    raw = read_image_gray(fname)

    # Binarize.
    gray_o, bina_o = Binarization(raw)

    # Binarization failed: keep a copy of the input and report failure.
    # (A plain file copy replaces the former cv2.imwrite call, which was
    # missing its image argument and therefore always raised TypeError.)
    if gray_o is None or bina_o is None:
        shutil.copy(fname, os.path.join(save_path, os.path.basename(fname)))
        return 0

    # Estimate the text scale.
    bina_o = ocrolib.remove_noise(bina_o, 8)
    scale = psegutils.estimate_scale(bina_o)

    # Split the page into blocks.
    block_grays, block_binas = split_columns_vertical(gray_o, bina_o, scale)

    mb_dics = {}  # all file records after blank-row adjustment
    mb_block = []  # [block id, column count, row count] per block

    if len(block_grays) > 2:
        # Special layout (per the original comment: columns separated by
        # vertical black rules): go straight to line segmentation.
        for i, gray_i in enumerate(block_grays):
            bina_i = block_binas[i]

            # Separate the caption area from the main body.
            _, _, mb_gray, mb_bina = get_caption_mainbody(
                gray_i, bina_i, scale)

            # Line segmentation; results are stored in a dictionary.
            mb_dic, max_row = mainbody_textline_segment(
                mb_gray, mb_bina, scale, i, 0, {})
            max_col = 0
            max_row += 1

            # First structuring pass: adjust blank cells.
            mb_dics = modify_fname_dictionary(mb_dic, mb_dics)
            mb_block.append([i, max_col, max_row])
    else:
        # Regular layout: one or two blocks.
        for i, gray_i in enumerate(block_grays):
            bina_i = block_binas[i]

            # Separate the caption area from the main body.
            _, cp_bina, mb_gray, mb_bina = get_caption_mainbody(
                gray_i, bina_i, scale)

            # Column positions derived from the caption, if there is one.
            cp_index_list = []
            if cp_bina is not None:
                cp_index_list = caption_segment(cp_bina)

            # Split the main body into column images.
            mb_grays, mb_binas = mainbody_segment(mb_gray, mb_bina, scale,
                                                  cp_index_list)

            # Line segmentation of every column of the main body.
            max_col = 0  # highest column index seen in this block
            max_row = 0  # highest row index seen in this block
            mb_dic = {}  # file records of this block
            for j, bina_j in enumerate(mb_binas):
                if j > max_col:
                    max_col = j
                gray_j = mb_grays[j]

                mb_dic, row = mainbody_textline_segment(
                    gray_j, bina_j, scale, i, j, mb_dic)
                if row > max_row:
                    max_row = row

            max_col += 1
            max_row += 1

            # Skip blocks that produced no records.  (The former test
            # `mb_dic is not {}` compared identity with a fresh dict and
            # was therefore always true.)
            if mb_dic:
                # First structuring pass: adjust blank cells.
                mb_dics = modify_fname_dictionary(mb_dic, mb_dics)
                mb_block.append([i, max_col, max_row])

    # Structuring post-process: merge the blocks.
    res_mb_dics = modify_mainbody_display(mb_dics, mb_block)
    # Remove the block markers.
    res_mb_dics = add_flag(res_mb_dics)

    # Save the segmentation results and report success.
    save_img_from_dic(save_path, res_mb_dics)
    return 1
Beispiel #9
0
def mainbody_textline_segment(gray, bina, scale, black_id, col_id, dictionary):
    '''
    Cut one column of the main body into text lines and record them.

    :param gray: grayscale image of the column
    :param bina: binary image of the column; ignored, the binary image is
                 recomputed as ``gray < 0.5``
    :param scale: ignored; re-estimated from the recomputed binary image
    :param black_id: id of the block this column belongs to
    :param col_id: id of the column inside its block
    :param dictionary: file records; updated in place, one entry per line
                       under the key ``'<black_id>.<col_id>.<row>.png'``
    :return: ``(dictionary, max_row)`` where ``max_row`` is the index of the
             last stored line (0 when at most one line was stored)
    '''

    # Suppress several kinds of clutter.
    bina = 1 * (gray < 0.5)
    bina = ocrolib.remove_noise(bina, 5)  # drop small noise components
    scale = psegutils.estimate_scale(bina)
    height = gray.shape[0]
    # Remove long regions along either axis (per the original comments:
    # bright streaks near the horizontal and vertical edges).
    lines = morph.select_regions(bina, sl.dim0, min=2 * scale)
    bina = bina - lines
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines

    # Merge characters into lines.
    textlines = filters.maximum_filter(bina, (0, scale))
    textlines = morph.rb_erosion(textlines, (3, 0))
    textlines = morph.rb_dilation(textlines, (0, scale))

    # Rows whose pixel sum exceeds `scale` count as text rows.
    row_sums = np.sum(textlines, axis=1)
    text_index = np.flatnonzero((1.0 * row_sums / scale) > 1).tolist()

    max_row = 0
    if text_index:
        # Group consecutive text rows into [first_row, last_row] runs.  Every
        # row is visited, so the final row is no longer dropped.
        indexs = []
        beg_index = end_index = text_index[0]
        for row in text_index[1:]:
            if row - end_index != 1:
                indexs.append([beg_index, end_index])
                beg_index = row
            end_index = row
        indexs.append([beg_index, end_index])

        # Keep only runs that are tall enough to be a text line.
        results_indexs = [index for index in indexs
                          if index[1] - index[0] >= scale / 4]

        for row_id, index in enumerate(results_indexs):
            key = '%d.%d.%d.png' % (black_id, col_id, row_id)
            # Cut the line with a 5-row margin, clipped to the image.
            data = 255 * gray[max(0, index[0] - 5):min(height, index[1] + 5), :]
            dictionary[key] = name_dic(index, data)
            max_row = row_id
    return dictionary, max_row
Beispiel #10
0
def mainbody_segment(gray, binary, scale, index_list):
    '''
    Cut the "core indicator" area into vertical strips along blank columns.

    :param gray: image that is sliced with the same column ranges as ``binary``
    :param binary: the "core indicator" area to analyse
    :param scale: character width, float
    :param index_list: list of ``[begin, end]`` horizontal positions of the
        text found in the "title bar"; the list is not modified
    :return: ``(gray_lists, bina_lists)``: the column slices of ``gray`` and
        of ``binary`` for every strip, ordered from left to right
    '''

    # Used when a "title bar" exists: find the vertical blank separator
    # inside a window of the image.
    def search_sep_index1(bina, th, n=2):
        '''
        :param bina: image window to inspect
        :param th: the chosen column must hold fewer than ``th`` white pixels
        :param n: number of adjacent columns summed per candidate position
        :return: column where ``n`` adjacent columns hold the fewest white
            pixels (the middle between the first and the last such position
            when the minimum occurs several times), or None when no position
            qualifies
        '''
        height, width = bina.shape
        first_min = None
        last_min = None
        min_sum = n * height
        all_sum = np.sum(bina, axis=0)
        for i in range(0, width - 1):
            num_sum = sum(all_sum[i:i + n])
            if num_sum < min_sum:
                min_sum = num_sum
                first_min = i
                last_min = i
            elif num_sum == min_sum:
                last_min = i
        if first_min is None:
            return None

        # Floor division keeps the result usable as an array index.
        res_index = (first_min + 1 + last_min) // 2
        # Only a column with few enough white pixels is a real separator.
        if np.sum(bina[:, res_index]) < th:
            return res_index
        return None

    # Used when no "title bar" exists: find the vertical blank gaps.
    def search_sep_index2(bina, scale):
        '''
        :param bina: the "core indicator" image to inspect, array
        :param scale: character width, float
        :return: list with the middle column of every gap between two
            neighbouring text spans, ints
        '''
        col_sums = np.sum(bina, axis=0)

        # Noise rejection: a column counts as text only when its pixel sum
        # exceeds ``scale``.
        ratios = list(1.0 * col_sums / scale)
        text_index_temp = [i for i, ratio in enumerate(ratios) if ratio > 1]

        text_index_acct = []  # accepted text spans as [begin, end]

        if len(text_index_temp) > 0:
            beg_index = text_index_temp[0]
            end_index = text_index_temp[0]
            for i in range(1, len(text_index_temp)):
                end_index = text_index_temp[i]
                # A new span starts only when the gap exceeds 4 columns.
                if text_index_temp[i] - text_index_temp[i - 1] > 4:
                    end_index = text_index_temp[i - 1]
                    # A closed span is kept only when it is wider than one
                    # character.
                    if end_index - beg_index > scale:
                        text_index_acct.append([beg_index, end_index])
                    beg_index = text_index_temp[i]
            # The trailing span is always kept.
            text_index_acct.append([beg_index, end_index])

        # Spans are produced in ascending order; separators are the integer
        # midpoints of the gaps between consecutive spans.
        return [(nxt[0] + cur[1]) // 2
                for cur, nxt in zip(text_index_acct, text_index_acct[1:])]

    # Remove noise and long runs near the borders before looking for gaps.
    bina = ocrolib.remove_noise(binary, 8)
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines
    lines = morph.select_regions(bina, sl.dim0, min=2 * scale)
    bina = bina - lines

    # A "title bar" is considered present for 4 or 5 title entries.
    if 6 > len(index_list) > 3:
        # The loop below consumes the list; work on a copy so the caller's
        # list is left untouched.
        pending = list(index_list)
        colsep_index = []
        # Dilation: white regions grow.
        bina_d = filters.maximum_filter(bina, (scale, scale))
        i = 0
        while len(pending):
            # Take the centres of two consecutive title entries as the search
            # window. E.g. for neighbouring title texts at [x00, x01] and
            # [x10, x11] the separator is expected inside
            # [(x00 + x01) / 2, (x10 + x11) / 2] of the "core indicator" area.
            # TODO: unresolved case - how to split the indicator area when
            # the title bar has only one attribute.
            sep_index = None
            while sep_index is None and i < len(pending) - 1:

                index = [int((pending[i][0] + pending[i][1]) // 2),
                         int((pending[i + 1][0] + pending[i + 1][1]) // 2)]

                bina_i = bina_d[:, index[0]:index[1]]
                sep_index = search_sep_index1(bina_i, 10 * scale)

                if sep_index is None:
                    # The initial split from the title bar failed: merge the
                    # two title entries and retry from the previous position.
                    index_re1 = pending[i]
                    index_re2 = pending[i + 1]
                    index_new = [index_re1[0], index_re2[1]]
                    pending.remove(index_re1)
                    pending.remove(index_re2)
                    pending.insert(i, index_new)
                    if len(colsep_index) > 0:
                        colsep_index.pop()
                    if i > 0:
                        i = i - 1
            if sep_index is not None:
                # Convert the window-relative column to an image column.
                sep_index = sep_index + index[0]
                colsep_index.append(sep_index)
                if i > 0:
                    i = i - 1
                    pending.remove(pending[i])
            else:
                pending.remove(pending[0])
            i += 1

    # No "title bar".
    else:
        # Open question from the original author: use 2 * scale instead?
        bina_d = filters.maximum_filter(bina, (scale, scale / 2))
        colsep_index = search_sep_index2(bina_d, scale)

    # Add the image borders and order all cut positions.
    colsep_index.append(0)
    colsep_index.append(bina.shape[1])
    colsep_index.sort()

    # Slice both images between every pair of neighbouring cut positions.
    bina_lists = []
    gray_lists = []
    for beg_index, end_index in zip(colsep_index[:-1], colsep_index[1:]):
        gray_lists.append(gray[:, beg_index:end_index])
        bina_lists.append(binary[:, beg_index:end_index])
    return gray_lists, bina_lists
Beispiel #11
0
def process(job):
    """Segment one page image into text lines and write them to disk.

    ``job`` is an ``(imagepath, index)`` pair. The output directory is the
    image path with its extensions stripped; it receives one ``.bin.png`` per
    text line, and ``<outputdir>.pseg.png`` receives the page segmentation.

    Returns the output directory, or None when the page is skipped (unreadable
    image, failed page check, bad scale, too many lines).
    """
    # The job index gets its own name so the line loops below cannot
    # overwrite it before the final summary is printed.
    imagepath, job_index = job
    global base
    base, _ = ocrolib.allsplitext(imagepath)
    outputdir = base
    imagename_base = os.path.basename(os.path.normpath(base))

    try:
        binary = ocrolib.read_image_binary(imagepath)
    except IOError:
        if ocrolib.trace: traceback.print_exc()
        print_error("cannot open either %s.bin.png or %s" % (base, imagepath))
        return

    checktype(binary, ABINARY2)

    if not args['nocheck']:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print_error("%s SKIPPED %s (use -n to disable this check)" %
                        (imagepath, check))
            return

    binary = 1 - binary  # invert

    if args['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args['scale']
    print_info("scale %f" % (scale))
    if isnan(scale) or scale > 1000.0:
        print_error("%s: bad scale (%g); skipping\n" % (imagepath, scale))
        return
    if scale < args['minscale']:
        print_error("%s: scale (%g) less than --minscale; skipping\n" %
                    (imagepath, scale))
        return

    # find columns and text lines

    if not args['quiet']: print_info("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if amax(segmentation) > args['maxlines']:
        print_error("%s: too many lines %g" % (imagepath, amax(segmentation)))
        return
    if not args['quiet']: print_info("number of lines %g" % amax(segmentation))

    # compute the reading order

    if not args['quiet']: print_info("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs

    nlabels = amax(segmentation) + 1
    renumber = zeros(nlabels, 'i')
    for rank, line_index in enumerate(lsort):
        renumber[lines[line_index].label] = 0x010000 + (rank + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    if not args['quiet']: print_info("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[line_index] for line_index in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args['noise'])
    for line_no, line in enumerate(lines, 1):
        binline = psegutils.extract_masked(1 - cleaned,
                                           line,
                                           pad=args['pad'],
                                           expand=args['expand'])
        ocrolib.write_image_binary(
            "%s/%s_01%04x.bin.png" % (outputdir, imagename_base, line_no),
            binline)
    print_info("%6d  %s %4.1f %d" %
               (job_index, imagepath, scale, len(lines)))
    return outputdir
Beispiel #12
0
def process1(job):
    """Segment one page into text lines and write the line images to disk.

    ``job`` is a ``(fname, index)`` pair. ``<base>.bin.png`` is read when it
    exists, otherwise ``fname`` itself. With ``args.gray`` set, the matching
    ``<base>.nrm.png`` is cut into grayscale line images as well.

    Returns None; the page is skipped with a message when it cannot be read,
    fails the page check, has a bad scale, has too many lines, or lacks the
    requested grayscale image.
    """
    # The job index gets its own name so the line loops below cannot
    # overwrite it before the final summary is printed.
    fname, job_index = job
    global base
    base, _ = ocrolib.allsplitext(fname)
    outputdir = base

    try:
        binary = ocrolib.read_image_binary(base + ".bin.png")
    except IOError:
        try:
            binary = ocrolib.read_image_binary(fname)
        except IOError:
            if ocrolib.trace:
                traceback.print_exc()
            print("cannot open either", base + ".bin.png", "or", fname)
            return

    checktype(binary, ABINARY2)

    if not args.nocheck:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print(fname, "SKIPPED", check, "(use -n to disable this check)")
            return

    if args.gray:
        # Without the normalized grayscale image there is nothing to cut
        # gray lines from; skip instead of failing on an unbound name.
        if not os.path.exists(base + ".nrm.png"):
            sys.stderr.write("%s: %s.nrm.png not found; skipping\n" %
                             (fname, base))
            return
        gray = ocrolib.read_image_gray(base + ".nrm.png")
        checktype(gray, GRAYSCALE)

    binary = 1 - binary  # invert

    if args.scale == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args.scale
    print("scale", scale)
    if isnan(scale) or scale > 1000.0:
        sys.stderr.write("%s: bad scale (%g); skipping\n" % (fname, scale))
        return
    if scale < args.minscale:
        sys.stderr.write("%s: scale (%g) less than --minscale; skipping\n" %
                         (fname, scale))
        return

    # find columns and text lines

    if not args.quiet:
        print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if amax(segmentation) > args.maxlines:
        print(fname, ": too many lines", amax(segmentation))
        return
    if not args.quiet:
        print("number of lines", amax(segmentation))

    # compute the reading order

    if not args.quiet:
        print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs

    nlabels = amax(segmentation) + 1
    renumber = zeros(nlabels, 'i')
    for rank, line_index in enumerate(lsort):
        renumber[lines[line_index].label] = 0x010000 + (rank + 1)
    segmentation = renumber[segmentation]

    # finally, output everything

    if not args.quiet:
        print("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[line_index] for line_index in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args.noise)
    for line_no, line in enumerate(lines, 1):
        binline = psegutils.extract_masked(1 - cleaned,
                                           line,
                                           pad=args.pad,
                                           expand=args.expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, line_no),
                                   binline)
        if args.gray:
            grayline = psegutils.extract_masked(gray,
                                                line,
                                                pad=args.pad,
                                                expand=args.expand)
            ocrolib.write_image_gray("%s/01%04x.nrm.png" % (outputdir, line_no),
                                     grayline)
    print("%6d" % job_index, fname, "%4.1f" % scale, len(lines))
Beispiel #13
0
    def _process_segment(self, page_image, page, region_xywh, page_id,
                         input_file, n):
        """Split a page image into text lines and attach them to ``page``.

        The image is converted to an inverted 0/1 array, segmented into
        lines, and every line is saved as an image through the workspace and
        added as a TextLine (with an AlternativeImage) to the last text
        region of ``page``. When the page has no text region, one covering
        the whole array is created first.

        Returns None early, after a warning, when the scale is unusable or
        the segmentation yields more lines than ``parameter['maxlines']``.
        """
        binary = ocrolib.pil2array(page_image)
        binary = np.array(1 - binary / np.amax(binary), 'B')
        if page.get_TextRegion() is None or len(page.get_TextRegion()) < 1:
            # NOTE(review): "x" here runs along the first array axis and "y"
            # along the second; confirm this matches the coordinate order
            # expected in Coords.
            min_x, max_x = (0, binary.shape[0])
            min_y, max_y = (0, binary.shape[1])
            textregion = TextRegionType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            page.add_TextRegion(textregion)
        else:
            textregion = page.get_TextRegion()[-1]
        ocrolib.write_image_binary("test.bin.png", binary)
        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']
        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            # The page id identifies the page in the log; no file name is
            # available in this method.
            LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            # One logging argument per placeholder.
            LOG.warning("%s: too many lines %i", page_id,
                        np.amax(segmentation))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs

        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        # NOTE(review): ``cleaned`` is not used below; the line images are
        # cut from ``binary``.
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
        region_xywh['features'] += ",textline"
        for i, l in enumerate(lines):
            ocrolib.write_image_binary("test.bin.png", binary[l.bounds[0],
                                                              l.bounds[1]])
            min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
            min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)

            # Cut the line out and map it to a 0/255 image for saving.
            img = binary[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = ocrolib.array2pil(img)

            # Derive the output file id from the input id; fall back to a
            # padded id when the input group name is not part of it.
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line = TextLineType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)