def precinct(self):
    """
    Build a precinct identifier from the text in the left part of the
    "h1" bounding box.

    The characters falling in that region are grouped by their "top"
    value and collated into strings; the last resulting string is split
    on runs of two or more whitespace characters, and the second and
    third fields are joined with "|".
    """
    # Work on a copy so the stored bbox is left untouched.
    left_region = list(self.bboxes["h1"])
    # Halve the second-to-last coordinate.
    # NOTE(review): presumably the bbox is (x0, top, x1, bottom), which
    # makes this the right-hand x -- confirm against the bbox definition.
    left_region[-2] = float(left_region[-2]) / 2

    region_chars = intersects_bbox(self.chars, left_region)
    collated = region_chars.groupby("top").apply(collate_chars)
    last_text = collated.iloc[-1]

    fields = re.split(r"\s{2,}", last_text)
    return "|".join(fields[1:3])
def test_intersects_bbox(self):
    """
    Check which objects `utils.intersects_bbox` keeps for a reference
    bbox: an identical box, a contained box, a partially overlapping
    box, and a box sharing one full side are expected back; a box
    sharing only a corner and a fully separate box are not.
    """

    def make_obj(x0, top, x1, bottom):
        return {"x0": x0, "top": top, "x1": x1, "bottom": bottom}

    same_as_bbox = make_obj(0, 0, 20, 20)
    inside_bbox = make_obj(10, 10, 15, 15)
    overlaps_bbox = make_obj(10, 10, 30, 30)
    touching_one_side = make_obj(20, 0, 40, 20)
    touching_one_corner = make_obj(20, 20, 40, 40)
    fully_outside = make_obj(21, 21, 40, 40)

    objs = [
        same_as_bbox,
        inside_bbox,
        overlaps_bbox,
        touching_one_side,
        touching_one_corner,
        fully_outside,
    ]
    expected = [
        same_as_bbox,
        inside_bbox,
        overlaps_bbox,
        touching_one_side,
    ]

    # The reference bbox is derived from the first object itself.
    bbox = utils.obj_to_bbox(same_as_bbox)
    assert utils.intersects_bbox(objs, bbox) == expected
def words_to_edges_v(words, word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.

    Args:
        words: Word objects (dicts). This function reads each word's
            "x0" and "x1" keys directly; the `utils` helpers it calls
            may read further keys.
        word_threshold: Minimum number of words a cluster must contain
            for it to contribute an edge.

    Returns:
        A list of edge dicts with keys "x0", "x1", "top", "bottom",
        "height" and "orientation" (always "v"). There is one edge at
        the left side ("x0") of every retained rectangle, ordered by
        ascending "x0", followed by a single edge at the right side
        ("x1") of the grown rightmost rectangle. Returns an empty list
        when no rectangle is retained.
    """

    def vertical_edge(x, rect):
        # Zero-width vertical edge at `x`, spanning the rect's height.
        return {
            "x0": x,
            "x1": x,
            "top": rect["top"],
            "bottom": rect["bottom"],
            "height": rect["bottom"] - rect["top"],
            "orientation": "v",
        }

    # Find words that share the same left, right, or centerpoints
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(
        words, lambda w: (w["x0"] + w["x1"]) / 2, 1
    )
    clusters = by_x0 + by_x1 + by_center

    # Largest clusters first. The sort is stable (also with reverse=True),
    # so equally sized clusters keep their original relative order.
    sorted_clusters = sorted(clusters, key=len, reverse=True)

    # For each sufficiently large cluster, find the rectangle fitting all
    # of its words
    rects = [
        utils.objects_to_rect(cluster)
        for cluster in sorted_clusters
        if len(cluster) >= word_threshold
    ]

    # Condense overlapping rectangles: a rectangle is kept only if it does
    # not overlap one that was already kept, so rectangles from larger
    # clusters take precedence.
    condensed_rects = []
    for rect in rects:
        if not any(utils.objects_overlap(rect, kept) for kept in condensed_rects):
            condensed_rects.append(rect)

    if not condensed_rects:
        return []

    sorted_rects = sorted(condensed_rects, key=itemgetter("x0"))

    # Find the far-right boundary of the rightmost rectangle: repeatedly
    # replace it with the rectangle built from the words that start at or
    # to the right of its left side and intersect it, until it stops
    # changing.
    last_rect = sorted_rects[-1]
    while True:
        candidates = [w for w in words if w["x0"] >= last_rect["x0"]]
        words_inside = utils.intersects_bbox(
            candidates,
            (
                last_rect["x0"],
                last_rect["top"],
                last_rect["x1"],
                last_rect["bottom"],
            ),
        )
        grown = utils.objects_to_rect(words_inside)
        if grown == last_rect:
            break
        last_rect = grown

    # Describe all the left-hand edges of each text cluster, then close
    # the table with the right-hand edge of the grown rightmost rectangle.
    edges = [vertical_edge(rect["x0"], rect) for rect in sorted_rects]
    edges.append(vertical_edge(last_rect["x1"], last_rect))
    return edges