def my_filter(imageIn):
    from gamera.plugins.listutilities import median
    # ImageSegmentationError is assumed to be defined elsewhere in this module
    MAX_CCS = 4000
    count = 0
    image = imageIn
    #imageIn.remove_border()
    ccs = image.cc_analysis()
    print "filter started on", len(ccs), "elements..."
    if len(ccs) < 1:
        raise ImageSegmentationError("there are no ccs")
    if len(ccs) > MAX_CCS:
        raise ImageSegmentationError("there are more than " + str(MAX_CCS) + " ccs.")
    median_black_area = median([cc.black_area()[0] for cc in ccs])
    #filter long vertical runs left over from margins
    median_height = median([cc.nrows for cc in ccs])
    for cc in ccs:
        if (cc.nrows / cc.ncols > 6) and (cc.nrows > 1.5 * median_height):
            cc.fill_white()
            del cc
            count = count + 1
    #remove implausibly large ccs (pictures, blotches)...
    for cc in ccs:
        if cc.black_area()[0] > (median_black_area * 10):
            cc.fill_white()
            del cc
            count = count + 1
    #...and implausibly small ones (specks)
    for cc in ccs:
        if cc.black_area()[0] < (median_black_area / 10):
            cc.fill_white()
            del cc
            count = count + 1
    print "filter done.", len(ccs) - count, "elements left."
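# A minimal usage sketch for my_filter. "page.png" is a hypothetical input;
# my_filter raises ImageSegmentationError when the cc count is implausible.
def demo_my_filter():
    from gamera.core import init_gamera, load_image
    init_gamera()
    image = load_image("page.png").to_onebit()
    my_filter(image)  # fills noise ccs white in place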
def chars_make_words(lines_glyphs, threshold=None):
    """Groups the given glyphs to words based upon the horizontal distance
between adjacent glyphs.

Signature:

  ``chars_make_words (glyphs, threshold=None)``

with

  *glyphs*:
    A list of ``Cc`` data types, each of which representing a character.
    All glyphs must stem from the same single line of text.

  *threshold*:
    Horizontal white space greater than *threshold* will be considered
    a word separating gap. When ``None``, the threshold value is
    calculated automatically as 2.5 times the median white space
    between adjacent glyphs.

The result is a nested list of glyphs with each sublist representing
a word. This is the same data structure as used in `Textline.words`_.

.. _`Textline.words`: gamera.toolkits.ocr.classes.Textline.html
"""
    from gamera.plugins.listutilities import median
    glyphs = lines_glyphs[:]
    wordlist = []
    if threshold is None:
        spacelist = []
        for i in range(len(glyphs) - 1):
            spacelist.append(glyphs[i + 1].ul_x - glyphs[i].lr_x)
        if len(spacelist) > 0:
            threshold = median(spacelist) * 2.5
        else:
            threshold = 0
    word = []
    for i in range(len(glyphs)):
        if i > 0:
            if (glyphs[i].ul_x - glyphs[i - 1].lr_x) > threshold:
                wordlist.append(word)
                word = []
        word.append(glyphs[i])
    if len(word) > 0:
        wordlist.append(word)
    return wordlist
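# A minimal usage sketch for chars_make_words, assuming "line.png" (hypothetical)
# contains a single line of text; glyphs must be ordered left to right.
def demo_chars_make_words():
    from gamera.core import init_gamera, load_image
    init_gamera()
    onebit = load_image("line.png").to_onebit()
    glyphs = onebit.cc_analysis()
    glyphs.sort(lambda a, b: a.ul_x - b.ul_x)   # order glyphs left to right
    words = chars_make_words(glyphs)            # auto threshold: 2.5 * median gap
    print len(words), "words found"
    return words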
def my_filter(imageIn):
    from gamera.plugins.listutilities import median
    count = 0
    image = imageIn
    ccs = image.cc_analysis()
    print "filter started on", len(ccs), "elements..."
    median_black_area = median([cc.black_area()[0] for cc in ccs])
    #also check for height?
    for cc in ccs:
        if cc.black_area()[0] > (median_black_area * 10):
            cc.fill_white()
            del cc
            count = count + 1
    for cc in ccs:
        if cc.black_area()[0] < (median_black_area / 10):
            cc.fill_white()
            del cc
            count = count + 1
    print "filter done.", len(ccs) - count, "elements left."
def performGreekOCR(options):
    import mahotas as mh
    import numpy as np
    from gamera.plugins import numpy_io

    MAX_CCS = 6500
    # a smaller, size-invariant feature set also works:
    # features = ["aspect_ratio", "volume64regions", "moments", "nholes_extended"]
    features = ["aspect_ratio", "moments", "ncols_feature", "nholes",
                "nholes_extended", "nrows_feature", "skeleton_features",
                "top_bottom", "volume", "volume16regions", "volume64regions",
                "zernike_moments"]
    image_files = []

    g = GreekOCR(splits=options["split"], feats=features)
    g.mode = options["mode"] + "body"
    g.autogroup = options["autogroup"]
    g.debug = options["debug"]
    g.load_trainingdata(options["trainingdata"])

    g_appcrit = GreekOCR(splits=options["split"], feats=features)
    g_appcrit.mode = options["mode"] + "appcrit"
    g_appcrit.autogroup = options["autogroup"]
    g_appcrit.debug = options["debug"]
    g_appcrit.load_trainingdata(options["trainingdata"])

    if options["hocrfile"]:
        g.hocr = options["hocrfile"]
    if options["settingsfile"]:
        g.load_settings(options["settingsfile"])
        g_appcrit.load_settings(options["settingsfile"])

    if options["otsu"]:
        otsu_factors = [float(x) for x in options["otsu"].split(',')]
    else:
        otsu_factors = [0]

    if options["directory"]:
        image_files = os.listdir(options["directory"])
        image_files = [os.path.join(options["directory"], x) for x in image_files]
        test = re.compile(".png$", re.IGNORECASE)
        image_files = filter(test.search, image_files)
        image_files.sort()
    elif options["imagefile"]:
        image_files = options["imagefile"]

    image_file_count = 1
    image_path = os.path.abspath(image_files[0])
    image_split_path = os.path.split(image_path)
    book_code = os.path.split(image_split_path[0])[1]
    book_id = 0
    if options.has_key("sql") and options["sql"]:
        book_id = sql_make_book_and_return_id(book_code)
    # if we made one hOCR tree per book, it would be created here:
    # if options.has_key("hocrout") and options["hocrout"]:
    #     hocr_tree = hocr_make_tree_and_return(book_code)

    for image_file in image_files:
        image_path = os.path.abspath(image_file)
        image_split_path = os.path.split(image_path)
        book_code = os.path.split(image_split_path[0])[1]  # directory name
        image_file_name = image_split_path[1]
        imageBase, imageEx = os.path.splitext(image_file_name)
        threshold_info = ""
        print "Now working with image: " + image_file_name
        internal_image_file_path = os.path.join(book_code, image_file_name)
        if imageEx == ".jp2":
            # gamera cannot load jp2 directly, so go through mahotas/numpy
            try:
                jp2Image = mh.imread(image_file, as_grey=True)
                jp2Image = jp2Image.astype(np.uint8)
                imageIn = numpy_io.from_numpy(jp2Image)
            except:
                print "Unexpected error:", sys.exc_info()[0]
                raise
        else:
            try:
                imageIn = load_image(image_file)
            except:
                continue
        imageType = imageIn.data.pixel_type
        if imageType != ONEBIT:
            if imageType != GREYSCALE:
                imageIn = imageIn.to_greyscale()
            otsu_thresh = imageIn.otsu_find_threshold()

        for otsu_factor in otsu_factors:
            if options.has_key("hocrout") and options["hocrout"]:
                hocr_tree = hocr_make_tree_and_return(book_code)
            if imageIn.data.pixel_type == ONEBIT:
                threshold_info = "thresh_128"
                otsu_thresh = 1.0
                image = imageIn
                if options["debug"]:
                    print "image is ONEBIT; doing no threshold optimization."
            else:
                current_thresh = otsu_thresh * otsu_factor
                if current_thresh > 253.0:
                    current_thresh = 253.0
                current_thresh = int(current_thresh)
                threshold_info = "thresh_" + str(current_thresh)
                image = imageIn.threshold(current_thresh)
                print "Otsu factor: ", otsu_factor, " threshold: ", current_thresh
            if options["hocrfile"]:
                hocr_to_use = string.replace(options["hocrfile"], "%s", imageBase)
                g.hocr = hocr_to_use
                if options["debug"]:
                    print "using '" + hocr_to_use + "' as hocr file"
            # run cc_analysis unconditionally so the sanity check below
            # always has ccs, even when the filter option is off
            ccs = image.cc_analysis()
            if options.has_key("filter") and options["filter"] == True:
                count = 0
                if options.has_key("debug") and options["debug"] == True:
                    print "filter started on", len(ccs), "elements..."
                #filter long vertical runs left over from margins
                ## Aggressive run filtering:
                ## median_height = median([cc.nrows for cc in ccs])
                ## for cc in ccs:
                ##     #TODO: add another condition that keeps these at edges of page
                ##     if (cc.nrows / cc.ncols > 6) and (cc.nrows > 1.5 * median_height):
                ##         cc.fill_white()
                ##         del cc
                ##         count = count + 1
                median_black_area = median([cc.black_area()[0] for cc in ccs])
                for cc in ccs:
                    if cc.black_area()[0] > (median_black_area * 10):
                        cc.fill_white()
                        del cc
                        count = count + 1
                for cc in ccs:
                    if cc.black_area()[0] < (median_black_area / 10):
                        cc.fill_white()
                        del cc
                        count = count + 1
                if options.has_key("debug") and options["debug"] == True:
                    print "filter done.", len(ccs) - count, "elements left."
            if (len(ccs) < 5) or (len(ccs) > MAX_CCS):
                print "Error: there are " + str(len(ccs)) + " ccs. Max is " + str(MAX_CCS) + ". Omitting this image."
                #raise ImageSegmentationError("Error: there are " + str(len(ccs)) + " ccs. Max is " + str(MAX_CCS) + " Omitting this image.")
            else:
                if options.has_key("deskew") and options["deskew"] == True:
                    #from gamera.toolkits.otr.otr_staff import *
                    if options.has_key("debug") and options["debug"] == True:
                        print "\ntry to skew correct..."
                    rotation = image.rotation_angle_projections(-10, 10)[0]
                    image = image.rotate(rotation, 0)  # continue with the deskewed image
                    if options.has_key("debug") and options["debug"] == True:
                        print "rotated with", rotation, "angle"
                if options.has_key("mode") and options["mode"] == "teubner":
                    (body_image, app_crit_image) = splitAppCritTeubner(image)
                    output = g.process_image(body_image)
                    if app_crit_image:
                        print "there is an app. crit image"
                        appcrit_output = g_appcrit.process_image(app_crit_image)
                    else:
                        print "there is no app. crit image"
                        appcrit_output = ""
                    output = output + appcrit_output
                else:
                    output = g.process_image(image)
                output_file_name_base = options["unicodeoutfile"] + imageBase + "_" + imageEx[1:] + "_" + threshold_info
                if options.has_key("debug") and options["debug"] == True:
                    g.save_debug_images(output_file_name_base)
                    if options.has_key("mode") and options["mode"] == "teubner" and app_crit_image:
                        #TODO: make more general
                        g_appcrit.save_debug_images(output_file_name_base + "_appcrit")
                if options.has_key("hocrout") and options["hocrout"]:
                    #if we turned this on, we would make a separate div for each page of input
                    #hocr_tree = hocr_make_page_and_return_div(internal_image_file_path,image_file_count,book_id,hocr_tree)
                    g.store_hocr(internal_image_file_path, hocr_tree)
                    if options.has_key("mode") and options["mode"] == "teubner" and app_crit_image:
                        g_appcrit.store_hocr(internal_image_file_path, hocr_tree)
                if options.has_key("sql") and options["sql"]:
                    page_id = sql_make_page_and_return_id(internal_image_file_path, image_file_count, book_id)
                    g.store_sql(image_path, page_id)
                if options.has_key("unicodeoutfile"):
                    if options.has_key("hocrout") and options["hocrout"]:
                        g.save_text_hocr(hocr_tree, output_file_name_base + ".html")
                    else:
                        g.save_text_unicode(output_file_name_base + ".txt")
                        if options.has_key("mode") and options["mode"] == "teubner":
                            #TODO: make the above more general
                            g_appcrit.save_text_unicode(output_file_name_base + "_appcrit.txt")
                elif options.has_key("teubneroutfile"):
                    g.save_text_teubner(options["teubneroutfile"])
                else:
                    print output
        image_file_count += 1
def __call__(self, Ex=-1, Ey=-1, iterations=2):

    # bbox with contained cc indices
    class Bbox:
        def __init__(self, allccs, indices):
            self.ccs = allccs
            self.indices = indices
            if len(indices) == 1:
                self.rect = Rect(allccs[indices[0]])
            else:
                self.rect = allccs[indices[0]].union_images(
                    [allccs[i] for i in indices])
        def extend(self, Ex, Ey, img):
            ul_y = max(0, self.rect.ul_y - Ey)
            ul_x = max(0, self.rect.ul_x - Ex)
            lr_y = min(img.lr_y, self.rect.lr_y + Ey)
            lr_x = min(img.lr_x, self.rect.lr_x + Ex)
            nrows = lr_y - ul_y + 1
            ncols = lr_x - ul_x + 1
            self.rect = Rect(Point(ul_x, ul_y), Dim(ncols, nrows))
        def merge(self, other):
            self.indices += other.indices
            self.rect.union(other.rect)

    # does one merging step
    def merge_boxes(bboxes):
        from gamera import graph
        bboxes.sort(lambda b1, b2: b1.rect.ul_y - b2.rect.ul_y)
        g = graph.Graph(graph.UNDIRECTED)
        # build graph where edge means overlap of two boxes
        for i in range(len(bboxes)):
            g.add_node(i)
        for i in range(len(bboxes)):
            for j in range(i + 1, len(bboxes)):
                if bboxes[j].rect.ul_y > bboxes[i].rect.lr_y:
                    break
                if bboxes[i].rect.intersects(bboxes[j].rect):
                    if not g.has_edge(i, j):
                        g.add_edge(i, j)
        new_bboxes = []
        for sg in g.get_subgraph_roots():
            seg = [n() for n in g.BFS(sg)]
            bbox = bboxes[seg[0]]
            for i in range(1, len(seg)):
                bbox.merge(bboxes[seg[i]])
            new_bboxes.append(bbox)
        return new_bboxes

    # the actual plugin
    from gamera.core import Dim, Rect, Point, Cc
    from gamera.plugins.listutilities import median
    page = self.image_copy()
    ccs = page.cc_analysis()
    # compute average CC size
    if Ex == -1:
        Ex = 2 * median([c.ncols for c in ccs])
    if Ey == -1:
        Ey = median([c.nrows for c in ccs])
    # create merged segments
    bboxes = [Bbox(ccs, [i]) for i in range(len(ccs))]
    for bb in bboxes:
        bb.extend(Ex, Ey, page)
    for i in range(iterations):
        oldlen = len(bboxes)
        bboxes = merge_boxes(bboxes)
        if oldlen == len(bboxes):
            break
    seg_ccs = []
    for i, bbox in enumerate(bboxes):
        label = i + 1
        ccs_of_segment = [ccs[j] for j in bbox.indices]
        for cc in ccs_of_segment:
            self.highlight(cc, label)
        seg_ccs.append(
            Cc(self, label, ccs_of_segment[0].union_rects(ccs_of_segment)))
    return seg_ccs
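# A minimal usage sketch, assuming the __call__ above is registered as a Gamera
# plugin method (the name bbox_merging is an assumption; use whatever name the
# enclosing PluginFunction class declares). Ex/Ey default to multiples of the
# median cc size; larger values merge boxes more aggressively.
def demo_bbox_merging():
    from gamera.core import init_gamera, load_image
    init_gamera()
    onebit = load_image("page.png").to_onebit()   # hypothetical input page
    segments = onebit.bbox_merging(Ex=-1, Ey=-1, iterations=2)
    print len(segments), "segments found"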