def precinct(self):
    """
    Build a precinct identifier from text found in the left part of the
    "h1" bounding box.

    The second-to-last coordinate of ``self.bboxes["h1"]`` is halved
    (presumably the right edge, x1, of an (x0, top, x1, bottom) box --
    TODO confirm), the characters of ``self.chars`` intersecting that
    narrowed box are grouped by their "top" value, and each group is
    collated with ``collate_chars``. The last collated line is split on
    runs of two or more whitespace characters, and the second and third
    pieces are joined with "|".
    """
    # Work on a copy so the stored bbox is not modified.
    narrowed_box = list(self.bboxes["h1"])
    narrowed_box[-2] = float(narrowed_box[-2]) / 2

    chars_in_box = intersects_bbox(self.chars, narrowed_box)

    # One collated entry per distinct "top" value; only the last is used.
    collated_lines = chars_in_box.groupby("top").apply(collate_chars)
    last_line = collated_lines.iloc[-1]

    pieces = re.split(r"\s{2,}", last_line)
    return "|".join(pieces[1:3])
Ejemplo n.º 2
0
    def test_intersects_bbox(self):
        """
        Check which objects `utils.intersects_bbox` keeps for a bbox taken
        from the first object: the first four cases below are expected,
        the last two are not.
        """
        # One (x0, top, x1, bottom) tuple per scenario.
        coords = [
            (0, 0, 20, 20),  # Is same as bbox
            (10, 10, 15, 15),  # Inside bbox
            (10, 10, 30, 30),  # Overlaps bbox
            (20, 0, 40, 20),  # Touching on one side
            (20, 20, 40, 40),  # Touching on one corner
            (21, 21, 40, 40),  # Fully outside
        ]
        keys = ("x0", "top", "x1", "bottom")
        candidates = [dict(zip(keys, values)) for values in coords]

        target_bbox = utils.obj_to_bbox(candidates[0])
        expected = candidates[:4]

        assert utils.intersects_bbox(candidates, target_bbox) == expected
Ejemplo n.º 3
0
def words_to_edges_v(words, word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or center
    of at least `word_threshold` words.

    Returns a list of edge dicts with keys "x0", "x1", "top", "bottom",
    "height" and "orientation" (always "v"): one edge at the left side of
    each retained word rectangle, ordered by "x0", plus a final edge at the
    right side of the rightmost rectangle. Returns an empty list when no
    rectangle is retained.
    """

    def midpoint(word):
        return (word["x0"] + word["x1"]) / 2

    def edge_at(x, rect):
        # Zero-width vertical edge at `x`, spanning the rectangle's height.
        return {
            "x0": x,
            "x1": x,
            "top": rect["top"],
            "bottom": rect["bottom"],
            "height": rect["bottom"] - rect["top"],
            "orientation": "v",
        }

    # Group words sharing a left side, right side, or midpoint (the third
    # argument, 1, is presumably the clustering tolerance -- TODO confirm).
    clusters = (
        utils.cluster_objects(words, "x0", 1)
        + utils.cluster_objects(words, "x1", 1)
        + utils.cluster_objects(words, midpoint, 1)
    )

    # Biggest clusters first (stable sort, so ties keep their order), then
    # one bounding rectangle per cluster that meets the threshold.
    ranked = sorted(clusters, key=len, reverse=True)
    rects = [
        utils.objects_to_rect(cluster)
        for cluster in ranked
        if len(cluster) >= word_threshold
    ]

    # Keep a rectangle only if it overlaps none of those already kept, so
    # larger clusters win over smaller overlapping ones.
    kept = []
    for candidate in rects:
        if not any(utils.objects_overlap(candidate, other) for other in kept):
            kept.append(candidate)

    if not kept:
        return []
    left_to_right = sorted(kept, key=itemgetter("x0"))

    # Grow the rightmost rectangle until it stops changing, to locate the
    # far-right boundary.
    rightmost = left_to_right[-1]
    while True:
        eligible = [w for w in words if w["x0"] >= rightmost["x0"]]
        bounds = (
            rightmost["x0"],
            rightmost["top"],
            rightmost["x1"],
            rightmost["bottom"],
        )
        grown = utils.objects_to_rect(utils.intersects_bbox(eligible, bounds))
        if grown == rightmost:
            break
        rightmost = grown

    # Left-hand edge of every rectangle, then the far-right closing edge.
    edges = [edge_at(rect["x0"], rect) for rect in left_to_right]
    edges.append(edge_at(rightmost["x1"], rightmost))
    return edges