def process(self):
    """
    Segment every input page into text lines with ocropy.

    For each input file the PAGE document is loaded, the page image it
    references is read as a binary image and inverted, and a line
    segmentation is computed.  One ``TextLine`` per extracted line region is
    attached to a single ``TextRegion`` (id ``dummy``) whose polygon spans
    the whole page, and the serialized PAGE document is added to the
    workspace under the output file group.
    """
    for index, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", index, input_file)
        local_path = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", local_path)
        pcgts = page_from_file(local_path)
        page = pcgts.get_Page()
        width = page.get_imageWidth()
        height = page.get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)

        # Read the page image and invert it before segmenting.
        image_path = self.workspace.download_url(image_url)
        binary = 1 - ocrolib.read_image_binary(image_path)

        # A configured scale of 0 means "estimate it from the image".
        scale = self.parameter['scale']
        if scale == 0:
            scale = psegutils.estimate_scale(binary)
        log.debug(binary)

        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)

        # TODO reading order / enumber
        # log.debug("finding reading order")
        # lines = psegutils.compute_lines(pseg, scale)
        # order = psegutils.reading_order([l.bounds for l in lines])
        # lsort = psegutils.topsort(order)
        extractor = ocrolib.RegionExtractor()
        extractor.setPageLines(pseg)

        # All lines are collected in one region covering the full page.
        outline = "0,0 %s,0 %s,%s 0,%s" % (width, width, height, height)
        page_region = TextRegionType(
            id="dummy",
            Coords=CoordsType(points=outline))
        page.add_TextRegion(page_region)

        # Region indices start at 1 here; index 0 is never read.
        for lineno in range(1, extractor.length()):
            line_id = extractor.id(lineno)
            bbox = extractor.bbox(lineno)
            log.debug("id=%s bbox=%s", line_id, bbox)
            page_region.add_TextLine(
                TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_y0x0y1x1(bbox))))

        ID = concat_padded(self.output_file_grp, index)
        self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            mimetype=MIMETYPE_PAGE,
            local_filename="%s/%s.xml" % (self.output_file_grp, ID),
            content=to_xml(pcgts))
class OcropusSegmentPageBase(OcropusBase, base.JSONWriterMixin):
    """
    Segment an image using Ocropus.
    """
    abstract = True
    stage = stages.PAGE_SEGMENT
    intypes = [ocrolib.numpy.ndarray]
    outtype = dict

    def null_data(self):
        """
        Return a dictionary of empty box lists when ignored.
        """
        return dict(columns=[], lines=[], paragraphs=[])

    def process(self, input):
        """
        Segment a binary image.

        input: a binary image.
        return: a dictionary with the keys:
            bbox        [0, 0, input.shape[1], input.shape[0]]
            lines       list of (x0, y0, x1, y1) tuples
            paragraphs  list of (x0, y0, x1, y1) tuples
            columns     always an empty list here
        raise: OcropusNodeError if the segmenter raises IndexError,
            TypeError or ValueError.
        """
        out = dict(bbox=[0, 0, input.shape[1], input.shape[0]],
                   columns=[], lines=[], paragraphs=[])
        try:
            page_seg = self._comp.segment(input)
        except (IndexError, TypeError, ValueError) as err:
            # "except ... as" and str(err) work on Python 2.6+ and 3 alike;
            # the old comma form and err.message fail on Python 3.
            raise OcropusNodeError(str(err), self)

        regions = ocrolib.RegionExtractor()
        # A tuple of pairs keeps the extraction order deterministic.
        extractors = (
            ("lines", regions.setPageLines),
            ("paragraphs", regions.setPageParagraphs),
        )
        # NB: These coordinates are relative to the TOP of the page
        # for some reason
        for box, func in extractors:
            func(page_seg)
            # Region indices start at 1; index 0 is never read.
            for i in range(1, regions.length()):
                out[box].append((regions.x0(i), regions.y0(i),
                                 regions.x1(i), regions.y1(i)))
        return out
def __init__(self, *args, **kwargs):
    """
    Initialise the parent class and create the ocrolib helper objects.

    All positional and keyword arguments are passed through unchanged to
    the parent initialiser.
    """
    super(SegmentPageManual, self).__init__(*args, **kwargs)
    # Helper objects, each built with its default constructor arguments.
    self._regions = ocrolib.RegionExtractor()
    self._segmenter = ocrolib.SegmentPageByRAST1()
def processPngFile(outRoot, origFile, fileNum):
    """
    Segment one page image into text lines and save an annotated copy.

    The image is copied into a per-file working directory below `outRoot`,
    binarized, segmented into text lines, and every line is written as its
    own binary PNG.  A copy of the input with each line's bounding box
    outlined is then saved twice: next to the working copy
    (``<base>.out.png``) and in a sibling directory of `outRoot` named
    ``<last component of outRoot>.out``.

    Uses the names `maxlines`, `noise`, `pad` and `expand` and the helpers
    `binarize`, `desc`, `checktype`, `check_page` and
    `compute_segmentation`, all defined outside this function.

    outRoot: directory under which the working directory is created.
    origFile: path of the page image to process.
    fileNum: number used as the suffix of the working directory name.
    return: True on success; False when binarization fails, `check_page`
        rejects the image, the estimated scale is NaN or above 1000, or
        the segmentation has more than `maxlines` labels.
    """
    baseName = os.path.basename(origFile)
    baseBase, _ = os.path.splitext(baseName)
    outDir = os.path.join(outRoot, "%s.%03d" % (baseBase, fileNum))
    inFile = os.path.join(outDir, baseName)
    os.makedirs(outDir, exist_ok=True)
    shutil.copy(origFile, inFile)

    inBase, _ = ocrolib.allsplitext(inFile)
    print("** inBase=%s" % inBase)

    fname = inFile
    outputdir = inBase
    binFile = inBase + ".bin.png"
    outFile = inBase + ".out.png"
    outRoot2, outDir2 = os.path.split(outRoot)
    outFile2 = os.path.join(outRoot2, "%s.out" % outDir2, baseName)
    print("outFile2=%s" % outFile2)
    grayFile = inBase + ".nrm.png"
    psegFile = inBase + ".pseg.png"
    print(" inFile=%s" % inFile)
    print(" binFile=%s" % binFile)
    print("grayFile=%s" % grayFile)
    print(" outFile=%s" % outFile)
    assert inFile and binFile
    assert outFile != inFile
    assert outFile != binFile

    if not binarize(inFile, binFile, grayFile):
        binExists = os.path.exists(binFile)
        print("Couldn't binarize inFile=%s binFile=%s exists=%s" %
              (inFile, binFile, binExists))
        return False

    binary = ocrolib.read_image_binary(binFile)
    print("$$ %s=%s" % (binFile, desc(binary)))
    checktype(binary, ABINARY2)
    check = check_page(np.amax(binary) - binary)
    if check is not None:
        print("%s SKIPPED %s (use -n to disable this check)" % (inFile, check))
        return False

    # NOTE(review): the grayscale (args.gray / .nrm.png) branches were
    # commented out in the original; only binary line images are written.

    binary = 1 - binary  # invert
    scale = psegutils.estimate_scale(binary)
    print("scale %f" % scale)
    if np.isnan(scale) or scale > 1000.0:
        print("%s: bad scale (%g); skipping\n" % (fname, scale))
        return False

    # find columns and text lines
    print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if np.amax(segmentation) > maxlines:
        print("%s: too many lines %g" % (fname, np.amax(segmentation)))
        return False
    print("segmentation=%s" % desc(segmentation))
    print("number of lines %g" % np.amax(segmentation))

    # compute the reading order
    print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    print("$$ lsort = %d = %s...%s" % (len(lsort), lsort[:10], lsort[-10:]))

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    print("writing lines")
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and matches the call used for outDir above.
    os.makedirs(outputdir, exist_ok=True)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, noise)
    for i, line in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned, line,
                                           pad=pad, expand=expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
    # The original printed the loop variable here, which is unbound when no
    # lines were found; len(lines) - 1 is the same value for non-empty input.
    print("%6d %s %4.1f %d" % (len(lines) - 1, fname, scale, len(lines)))

    # to proceed, we need a pseg file and a subdirectory containing text lines
    assert os.path.exists(psegFile), "%s: no such file" % psegFile
    assert os.path.isdir(inBase), "%s: no such directory" % inBase

    # iterate through the text lines in reading order, based on the page
    # segmentation file
    pseg = ocrolib.read_page_segmentation(psegFile)
    print("$$ %s=%s" % (psegFile, desc(pseg)))
    regions = ocrolib.RegionExtractor()
    print("$$ regions=%s" % regions)
    regions.setPageLines(pseg)

    # The context manager closes the underlying image file when done.
    with Image.open(inFile) as im:
        print("~~%s %s" % (inFile, im.size))
        print("$$ regions=%s=%s" % (regions, sorted(regions.__dict__)))
        print("$$ regions.length=%s" % regions.length())

        # One drawing context serves all rectangles.
        draw = ImageDraw.Draw(im)
        # Region indices start at 1; index 0 is never read.
        for i in range(1, regions.length()):
            y0, x0, y1, x1 = regions.bbox(i)
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=3)
            draw.rectangle((x0, y0, x1, y1), outline=(0, 0, 255), width=0)
        del draw

        # write output files
        print("outFile=%s" % outFile)
        im.save(outFile, "PNG")
        print("outFile2=%s" % outFile2)
        outDir2 = os.path.dirname(outFile2)
        os.makedirs(outDir2, exist_ok=True)
        im.save(outFile2, "PNG")

    assert os.path.exists(outFile2)
    return True
def textline(self, arg):
    """
    Split the page image `arg` into text-line images with the external
    anyBaseOCR-gpageseg script.

    The image ``<base>.ts.png`` is pasted, centred, onto a white canvas of
    the same size as `arg` and saved as ``<base>/temp.png``.  The script
    found under ``self.param['libpath']`` is then run on that file with the
    options taken from ``self.param``.  The bounding boxes of the resulting
    line regions are written to ``<base>/sorted_lines.dat`` and every file
    found in ``<base>/temp/`` is converted into ``<base>/lines/``.

    arg: path of the page image; ``<base>`` is `arg` with its extensions
        removed by ``ocrolib.allsplitext``.
    return: list of the line image paths written below ``<base>/lines``.
    """
    image = ocrolib.read_image_binary(arg)
    height, width = image.shape
    base, _ = ocrolib.allsplitext(arg)

    lines_dir = "%s/lines" % base
    if not os.path.exists(lines_dir):
        os.makedirs(lines_dir)

    # A single block covering the whole page is written and read back below.
    cuts_path = "%s/sorted_cuts.dat" % base
    with open(cuts_path, 'w') as cuts_file:
        cuts_file.write("0 0 %d %d 0 0 0 0\n" % (int(width), int(height)))

    blockarray = []
    if os.path.exists(cuts_path):
        with open(cuts_path, "r") as cuts_file:
            for index, block in enumerate(cuts_file):
                words = block.split()
                blockarray.append((int(words[0]), -int(words[1]),
                                   int(words[2]), int(words[3]), index))
    else:
        blockarray.append((0, 0, width, height, 0))

    j = 0  # running line counter across all blocks
    lines = []
    for block in blockarray:
        # Only the block index is used; the block coordinates are not
        # applied to the image that gets segmented.
        i = block[4]

        # NOTE(review): the file written by this convert call is overwritten
        # by background.save() below - confirm whether it is still needed.
        os.system("convert %s.ts.png %s/temp.png" % (base, base))
        img = Image.open("%s.ts.png" % base, 'r')
        img_w, img_h = img.size
        background = Image.new('RGBA', (width, height), (255, 255, 255, 255))
        bg_w, bg_h = background.size
        offX = (bg_w - img_w) // 2
        offY = (bg_h - img_h) // 2
        background.paste(img, (offX, offY))
        background.save("%s/temp.png" % base)

        # NOTE(review): the command is run through the shell as one string,
        # so paths containing spaces or shell metacharacters will break it.
        script_args = (
            "/cli/anyBaseOCR-gpageseg.py %s/temp.png -n --minscale %f --maxlines %f --scale %f --hscale %f --vscale %f --threshold %f --noise %d --maxseps %d --sepwiden %d --maxcolseps %d --csminaspect %f --csminheight %f -p %d -e %d -Q %d"
            % (base, self.param['minscale'], self.param['maxlines'],
               self.param['scale'], self.param['hscale'],
               self.param['vscale'], self.param['threshold'],
               self.param['noise'], self.param['maxseps'],
               self.param['sepwiden'], self.param['maxcolseps'],
               self.param['csminaspect'], self.param['csminheight'],
               self.param['pad'], self.param['expand'],
               self.param['parallel']))
        command = "python " + self.param['libpath'] + script_args
        if self.param['blackseps']:
            command = command + " -b"
        if self.param['usegauss']:
            command = command + " --usegauss"
        os.system(command)

        pseg = ocrolib.read_page_segmentation("%s/temp.pseg.png" % base)
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)

        # Closing the file guarantees the bounding boxes reach the disk;
        # the original never closed this handle.
        with open('%s/sorted_lines.dat' % base, 'w') as lines_file:
            # Region indices start at 1; index 0 is never read.
            for h in range(1, regions.length()):
                y0, x0, y1, x1 = regions.bbox(h)
                # Undo the centring offset and measure y from img_h.
                lines_file.write("%d %d %d %d 0 0 0 0\n" % (
                    int(x0 - offX),
                    int(img_h - (y1 - offY)),
                    int(x1 - offX),
                    int(img_h - (y0 - offY))))

        # Presumably <base>/temp/ is filled by the script run above.
        for infile in sorted(glob.glob("%s/temp/*" % base)):
            line_path = "%s/lines/01%02x%02x.bin.png" % (base, i + 1, j + 1)
            os.system("convert %s %s" % (infile, line_path))
            lines.append(line_path)
            j += 1

        os.system("rm -r %s/temp/" % base)
        os.system("rm %s/temp.png %s/temp.pseg.png" % (base, base))
    return lines