Beispiel #1
0
def snap_edges(edges, tolerance=DEFAULT_SNAP_TOLERANCE):
    """
    Snap edges lying within `tolerance` pixels of one another to their
    positional average, and return the flattened list of snapped edges.
    """
    vertical = [e for e in edges if e["orientation"] == "v"]
    horizontal = [e for e in edges if e["orientation"] == "h"]

    # Vertical edges cluster on x0; horizontal edges cluster on top.
    snapped_v = [move_to_avg(group, "v")
                 for group in utils.cluster_objects(vertical, "x0", tolerance)]
    snapped_h = [move_to_avg(group, "h")
                 for group in utils.cluster_objects(horizontal, "top", tolerance)]

    return list(itertools.chain.from_iterable(snapped_v + snapped_h))
Beispiel #2
0
def words_to_edges_h(words,
    word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops of at least
    `word_threshold` words.
    """
    clusters = utils.cluster_objects(words, "top", 1)
    rects = [utils.objects_to_rect(c) for c in clusters
             if len(c) >= word_threshold]
    if len(rects) == 0:
        return []

    left = min(r["x0"] for r in rects)
    right = max(r["x1"] for r in rects)

    def make_edge(y):
        # A zero-height horizontal edge spanning the full text width.
        return {
            "x0": left,
            "x1": right,
            "top": y,
            "bottom": y,
            "width": right - left,
            "orientation": "h"
        }

    # One edge along the top of every rectangle, then one along each bottom.
    return ([make_edge(r["top"]) for r in rects]
            + [make_edge(r["bottom"]) for r in rects])
def extract_text(chars, x_tolerance=utils.DEFAULT_X_TOLERANCE,
                 y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    """
    Cluster chars into lines by "doctop" (within `y_tolerance`), collate
    each line, and join the lines with the "|&|" separator.

    Returns None when `chars` is empty.
    """
    if len(chars) == 0:
        return None

    char_list = utils.to_list(chars)
    line_clusters = utils.cluster_objects(char_list, "doctop", y_tolerance)
    return "|&|".join(collate_line(line, x_tolerance)
                      for line in line_clusters)
 def test_cluster_objects(self):
     # Clustering by string length with zero tolerance must group items
     # of equal length together, ordered by the key value.
     items = ["a", "ab", "abc", "b"]
     expected = [["a", "b"], ["ab"], ["abc"]]
     assert utils.cluster_objects(items, len, 0) == expected
Beispiel #5
0
def words_to_edges_v(words,
    word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or center
    of at least `word_threshold` words.

    :param words: list of word dicts in pdfplumber format
    :param word_threshold: minimum number of aligned words per line
    :return: list of zero-width vertical edge dicts — one for the left side
        of every condensed text cluster, plus one for the right side of the
        rightmost cluster (or [] when no cluster is large enough)
    """
    # Find words that share the same left, right, or center points.
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(words, lambda x: (x["x0"] + x["x1"])/2, 1)
    clusters = by_x0 + by_x1 + by_center

    # Keep the alignments shared by the most words.  sorted() is stable,
    # so equal-sized clusters keep their original relative order.
    sorted_clusters = sorted(clusters, key=len, reverse=True)
    large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)

    # For each of those alignments, the rectangle bounding its words.
    rects = list(map(utils.objects_to_rect, large_clusters))

    # Condense: drop any rectangle that overlaps one already kept.
    condensed_rects = []
    for rect in rects:
        if not any(utils.objects_overlap(rect, kept)
                   for kept in condensed_rects):
            condensed_rects.append(rect)

    if not condensed_rects:
        return []
    sorted_rects = sorted(condensed_rects, key=itemgetter("x0"))

    # Grow the rightmost rectangle until it stops absorbing words to its
    # right (iterate to a fixed point).
    last_rect = sorted_rects[-1]
    while True:
        words_inside = utils.intersects_bbox(
            [ w for w in words if w["x0"] >= last_rect["x0"] ],
            (last_rect["x0"], last_rect["top"], last_rect["x1"], last_rect["bottom"]),
        )
        rect = utils.objects_to_rect(words_inside)
        if rect == last_rect:
            break
        last_rect = rect

    def _vertical_edge(x, top, bottom):
        # Zero-width vertical edge at horizontal position `x`.
        return {
            "x0": x,
            "x1": x,
            "top": top,
            "bottom": bottom,
            "height": bottom - top,
            "orientation": "v"
        }

    # The left-hand edge of every cluster, plus the far-right edge of the
    # (grown) rightmost cluster.
    edges = [_vertical_edge(b["x0"], b["top"], b["bottom"])
             for b in sorted_rects]
    edges.append(_vertical_edge(last_rect["x1"], last_rect["top"],
                                last_rect["bottom"]))
    return edges
def extract_words(page,
                  x_tolerance=DEFAULT_X_TOLERANCE,
                  y_tolerance=DEFAULT_Y_TOLERANCE,
                  keep_blank_chars=False):
    """
    Group a page's chars into words.

    Chars are clustered into lines by "doctop" (within `y_tolerance`),
    de-duplicated, and split into words wherever the horizontal gap
    between consecutive chars exceeds `x_tolerance`.

    :param page: pdfplumber page-like object exposing a `.chars` list
    :param x_tolerance: max horizontal gap between chars of one word
    :param y_tolerance: max vertical distance between chars of one line
    :param keep_blank_chars: if False, whitespace chars split words
    :return: list of word dicts with keys x0/x1/top/bottom/text/chars
    """
    x_tolerance = decimalize(x_tolerance)
    y_tolerance = decimalize(y_tolerance)

    def process_word_chars(chars):
        # Collapse a list of chars into one word dict spanning their bbox.
        x0, top, x1, bottom = objects_to_bbox(chars)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": "".join(map(itemgetter("text"), chars)),
            "chars": chars
        }

    def make_set_clusters(doctop_clusters):
        # Remove exact-duplicate chars inside each line by round-tripping
        # through JSON and a set.  The JSON trip turns Decimal values into
        # floats, so those are restored afterwards.  NOTE: list(set(...))
        # does not preserve order; get_line_words re-sorts by "x0", so
        # ordering is recovered downstream.
        new_clusters = []
        for cluster in doctop_clusters:
            unique_serialized = list(set(simplejson.dumps(c) for c in cluster))
            restored = []
            for serialized in unique_serialized:
                d = simplejson.loads(serialized)
                for k, v in d.items():
                    if isinstance(v, float):
                        d[k] = Decimal(str(v))
                restored.append(d)
            new_clusters.append(restored)
        return new_clusters

    def check_two_chars(char1, char2):
        # True when the two chars are far enough apart horizontally to be
        # distinct; chars with near-identical x0 (< 1 unit apart) are
        # treated as duplicate glyphs — presumably double-printed
        # fake-bold chars; TODO confirm against the source PDFs.
        return abs(char1['x0'] - char2['x0']) >= 1

    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
        get_text = itemgetter("text")
        chars_sorted = sorted(chars, key=itemgetter("x0"))
        # Drop consecutive chars that sit at (nearly) the same x0.
        chars_sorted = [c for i, c in enumerate(chars_sorted)
                        if i == 0 or check_two_chars(c, chars_sorted[i - 1])]
        words = []
        current_word = []

        for char in chars_sorted:
            if not keep_blank_chars and get_text(char).isspace():
                # Whitespace terminates the current word (unless kept).
                if current_word:
                    words.append(current_word)
                    current_word = []
            elif not current_word:
                current_word.append(char)
            else:
                last_char = current_word[-1]
                if char["x0"] > (last_char["x1"] + tolerance):
                    # Gap wider than the tolerance starts a new word.
                    words.append(current_word)
                    current_word = []
                current_word.append(char)

        if current_word:
            words.append(current_word)
        return list(map(process_word_chars, words))

    chars = to_list(page.chars)
    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
    doctop_clusters = make_set_clusters(doctop_clusters)
    nested = [
        get_line_words(line_chars, tolerance=x_tolerance)
        for line_chars in doctop_clusters
    ]
    return list(itertools.chain(*nested))
Beispiel #7
0
    def fetch(self) -> Iterator[Journal]:
        """
        Download the PDF at ``self.url`` and yield parsed ``Journal``
        records.

        The PDF is buffered in memory, each page is cropped to its
        content area and split into two columns, and bold text lines are
        used as separators between journal entries.
        """
        resp = requests.get(self.url)
        resp.raise_for_status()

        # Pre-size the in-memory cache from Content-Length when the
        # server provides it.
        content_length = resp.headers.get("Content-Length", None)
        content_length = int(
            content_length) if content_length is not None else None
        cached_file = cache_in_memory(resp, size=content_length)
        with pdfplumber.open(cached_file) as pdf:

            def get_entries() -> Iterator[str]:
                """Yield the raw text of each journal entry in the PDF."""
                from pdfplumber.utils import cluster_objects, extract_text, DEFAULT_X_TOLERANCE
                import unicodedata

                # fontname_regex = re.compile(r"([A-Z]{6})\+([A-Za-z]+)(\d+)?")
                small_font_size_threshold = Decimal("8.0")

                def is_font_bold(char: PDFChar) -> bool:
                    # NOTE(review): assumes subset-tagged fontnames of the
                    # form "ABCDEF+BaseFont", and that "BX" in the base
                    # name marks bold — confirm against the source PDFs.
                    tag, fontname = char["fontname"].split("+")
                    return "BX" in fontname

                def is_font_small(char: PDFChar) -> bool:
                    return char["size"] < small_font_size_threshold

                def normalize_char(
                        char: PDFChar,
                        interpreter: PDFPageInterpreter) -> Optional[PDFChar]:
                    """
                    Normalize a char's "text" in place: resolve CID
                    escapes via `cmap_char`, NFKC-normalize, prefer
                    combining forms, and map small-font "o" to "°".
                    Returns the same char dict; "text" becomes None when
                    a CID cannot be resolved.
                    """
                    text = char["text"]
                    if len(text) > 1 and (
                            cid_match :=
                            cid_regex.fullmatch(text)) is not None:
                        cid = int(cid_match.group(1))
                        text = cmap_char(cid, char["fontname"], interpreter)
                        if text is None:
                            # Unresolvable CID: mark the char as textless.
                            char["text"] = None
                            return char

                    # When NFKC decomposes into base char + combining
                    # mark, keep only the combining mark.
                    ntext = unicodedata.normalize("NFKC", text)
                    if len(ntext) == 2 and unicodedata.combining(ntext[1]):
                        text = ntext[1]

                    text = make_combining_form(text) or text
                    if is_font_small(char):
                        # Small-font "o" is treated as a degree sign —
                        # presumably superscript °; TODO confirm.
                        if text == "o":
                            text = "°"

                    char["text"] = text
                    return char

                def sort_line_chars(
                        chars: Sequence[PDFChar],
                        interpreter: PDFPageInterpreter) -> Sequence[PDFChar]:
                    """
                    Generator: normalize a line's chars, sort by x0, and
                    emit each main (non-combining) char immediately
                    followed by the combining chars that overlap it by at
                    least half the main char's width.
                    """
                    chars = (normalize_char(char, interpreter)
                             for char in chars)
                    chars = sorted(chars, key=lambda char: char["x0"])
                    # partition() puts non-matching (main) chars first,
                    # matching (combining) chars second.
                    main_chars, combining_chars = partition(
                        lambda char: char["text"] and unicodedata.combining(
                            char["text"]), chars)
                    combining_chars_iter = peekable(iter(combining_chars))
                    for main_char in main_chars:
                        yield main_char

                        while combining_chars_iter:
                            combining_char = combining_chars_iter.peek()

                            # Horizontal overlap between the two chars.
                            overlap = max(
                                min(main_char["x1"], combining_char["x1"]) -
                                max(main_char["x0"], combining_char["x0"]), 0)
                            if overlap < main_char["width"] * Decimal("0.5"):
                                break

                            yield combining_char
                            next(combining_chars_iter, None)

                    # Every combining char must have been attached to a
                    # main char by now.
                    assert (next(combining_chars_iter, None) is None)

                    # Unreachable trailing `yield` — defensive marker that
                    # this function is a generator.
                    return
                    yield

                # Layout tuning constants (PDF units).
                x_tolerance = Decimal("3.0")
                y_tolerance = Decimal("3.0")
                min_tab_width = Decimal("8.0")

                for page in pdf.pages:
                    # A fresh interpreter per page; processing the page
                    # loads its resources (used for CID/cmap lookups).
                    device = PDFPageAggregator(
                        pdf.rsrcmgr,
                        pageno=page.page_number,
                        laparams=pdf.laparams,
                    )
                    interpreter = PDFPageInterpreter(pdf.rsrcmgr, device)
                    interpreter.process_page(page.page_obj)

                    # Crop away page margins (with extra space at the top
                    # of page 1, presumably a title header — TODO confirm),
                    # then split into left and right half-width columns.
                    contents = page.crop(
                        (
                            Decimal(100),
                            Decimal(70 +
                                    (200 if page.page_number == 1 else 0)),
                            page.width - Decimal(100),
                            page.height - Decimal(70),
                        ),
                        relative=False,
                    )
                    left_column = contents.crop(
                        (
                            Decimal(0),
                            Decimal(0),
                            contents.width * Decimal(0.5),
                            contents.height,
                        ),
                        relative=True,
                    )
                    right_column = contents.crop(
                        (
                            contents.width * Decimal(0.5),
                            Decimal(0),
                            contents.width,
                            contents.height,
                        ),
                        relative=True,
                    )

                    for column in (left_column, right_column):
                        # The top of each bold text line marks the start
                        # of an entry within the column.
                        bold_chars = filter(is_font_bold, column.chars)
                        bold_char_lines = cluster_objects(
                            bold_chars, "top", y_tolerance)
                        bold_line_y0s = (min(char["top"] for char in line)
                                         for line in bold_char_lines)

                        # Entry boundaries: each bold-line top plus the
                        # column bottom as the final boundary.
                        hsep_y0s = chain(bold_line_y0s, (column.bbox[3], ))
                        hsep_y0s = list(hsep_y0s)
                        for y0, y1 in windowed(hsep_y0s, 2):
                            if y1 is None:
                                # windowed() pads with None when there are
                                # fewer than two boundaries.
                                break
                            entry = column.within_bbox(
                                (
                                    column.bbox[0],
                                    max(y0 - y_tolerance, column.bbox[1]),
                                    column.bbox[2],
                                    min(y1 + y_tolerance, column.bbox[3]),
                                ),
                                relative=False,
                            )

                            entry_lines = cluster_objects(
                                entry.chars, "top", y_tolerance)
                            entry_text = StringIO()

                            # TODO: refactor into separate top-level function, along with sort_line_chars, normalize_char.
                            for line_chars in entry_lines:
                                line_chars = list(line_chars)
                                last_char: Optional[PDFChar] = None
                                for char in sort_line_chars(
                                        line_chars, interpreter):
                                    if last_char is not None and last_char[
                                            "text"] is not None:
                                        # Wide gaps become tabs; smaller
                                        # gaps become single spaces.
                                        if char["x0"] > last_char[
                                                "x1"] + min_tab_width:
                                            entry_text.write("\t")
                                        elif char["x0"] > last_char[
                                                "x1"] + x_tolerance:
                                            entry_text.write(" ")

                                    if char["text"] is not None:
                                        entry_text.write(char["text"])
                                        # Combining marks do not advance
                                        # the gap-tracking position.
                                        if not unicodedata.combining(
                                                char["text"]):
                                            last_char = char

                                entry_text.write("\n")

                            yield unicodedata.normalize(
                                "NFKC", entry_text.getvalue())

                # Unreachable trailing `yield` — defensive generator marker.
                return
                yield

            journal = Journal()
            for entry in get_entries():
                print(f"ENTRY: {entry}")
                # TODO: handle `[name in other language]` bits.
                pass

            if journal is not None and journal.names and journal.iso4:
                yield journal
Beispiel #8
0
def combine(obj, attr, x_tolerance=3, y_tolerance=3, keep_blank_chars=False):
    """
    Combine lower-level pdfplumber objects into higher-level ones:

        chars --> words          (attr == "chars")
        words --> combined_words (attr == "words")

    :param obj: char list or words list, in pdfplumber format
    :param attr: "chars" or "words", denoting what to combine
    :return: list of combined objects
    """
    def _merge(items):
        # Collapse a run of items into a single object whose text is the
        # space-joined item texts and whose bbox spans them all.
        x0, top, x1, bottom = objects_to_bbox(items)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": " ".join(map(itemgetter("text"), items))
        }

    def _split_line(line_items, tolerance=3):
        text_of = itemgetter("text")
        ordered = sorted(line_items, key=itemgetter("x0"))
        groups = []
        current = []

        for item in ordered:
            if not keep_blank_chars and text_of(item).isspace():
                # Whitespace closes the current group (unless kept).
                if current:
                    groups.append(current)
                    current = []
            elif not current:
                current.append(item)
            else:
                # A gap wider than the tolerance starts a new group.
                if item["x0"] > (current[-1]["x1"] + tolerance):
                    groups.append(current)
                    current = []
                current.append(item)

        if current:
            groups.append(current)
        return [_merge(g) for g in groups]

    ### cluster_objects requires different things for combining chars/words
    if attr == "chars":
        attr = "doctop"
    elif attr == "words":
        attr = "top"

    combined_objs = []
    for line in cluster_objects(obj, attr, y_tolerance):
        combined_objs.extend(_split_line(line, tolerance=x_tolerance))
    return combined_objs