def snap_edges(edges, tolerance=DEFAULT_SNAP_TOLERANCE):
    """
    Given a list of edges, snap any within `tolerance` pixels of one
    another to their positional average.
    """
    # Split the edges by orientation, then snap each orientation on its
    # own alignment axis (x0 for vertical edges, top for horizontal).
    vertical = [e for e in edges if e["orientation"] == "v"]
    horizontal = [e for e in edges if e["orientation"] == "h"]

    snapped_v = (
        move_to_avg(cluster, "v")
        for cluster in utils.cluster_objects(vertical, "x0", tolerance)
    )
    snapped_h = (
        move_to_avg(cluster, "h")
        for cluster in utils.cluster_objects(horizontal, "top", tolerance)
    )
    # Flatten the per-cluster lists back into one flat edge list.
    return [
        edge
        for group in itertools.chain(snapped_v, snapped_h)
        for edge in group
    ]
def words_to_edges_h(words, word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops of at least
    `word_threshold` words.
    """
    by_top = utils.cluster_objects(words, "top", 1)
    large_clusters = [c for c in by_top if len(c) >= word_threshold]
    rects = [utils.objects_to_rect(c) for c in large_clusters]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)

    def edge_at(y):
        # Zero-height horizontal edge spanning the full word extent at `y`.
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": max_x1 - min_x0,
            "orientation": "h"
        }

    # The top and bottom of every qualifying rectangle each contribute an
    # edge: first all tops, then all bottoms (matching original ordering).
    return [edge_at(r["top"]) for r in rects] + \
           [edge_at(r["bottom"]) for r in rects]
def extract_text(chars, x_tolerance=utils.DEFAULT_X_TOLERANCE, y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    """
    Collate `chars` into a single string: chars are clustered into lines
    by `doctop` (within `y_tolerance`), each line is collated with
    `collate_line`, and lines are joined with the "|&|" marker.
    Returns None when `chars` is empty.
    """
    if len(chars) == 0:
        return None
    char_list = utils.to_list(chars)
    line_clusters = utils.cluster_objects(char_list, "doctop", y_tolerance)
    collated = [collate_line(line, x_tolerance) for line in line_clusters]
    return "|&|".join(collated)
def test_cluster_objects(self):
    """cluster_objects with a callable key (len) and zero tolerance
    groups items of equal length together."""
    strings = ["a", "ab", "abc", "b"]
    expected = [["a", "b"], ["ab"], ["abc"]]
    assert utils.cluster_objects(strings, len, 0) == expected
def words_to_edges_v(words, word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.

    :param words: word dicts in pdfplumber format
    :param word_threshold: minimum number of words a candidate line must
        touch to be considered
    :return: list of zero-width vertical edge dicts
    """
    # Find words that share the same left, right, or centerpoints
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(words, lambda x: (x["x0"] + x["x1"])/2, 1)
    clusters = by_x0 + by_x1 + by_center

    # Find the points that align with the most words (largest first;
    # sort is stable so ties keep their original order).
    sorted_clusters = sorted(clusters, key=len, reverse=True)
    large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)

    # For each of those points, find the rectangles fitting all matching words
    rects = list(map(utils.objects_to_rect, large_clusters))

    # Iterate through those rectangles, condensing overlapping rectangles;
    # largest-first order means the dominant alignment wins.
    # (was: manual boolean flag with `if overlap == False`)
    condensed_rects = []
    for rect in rects:
        if not any(utils.objects_overlap(rect, c) for c in condensed_rects):
            condensed_rects.append(rect)

    if len(condensed_rects) == 0:
        return []
    sorted_rects = sorted(condensed_rects, key=itemgetter("x0"))

    # Find the far-right boundary of the rightmost rectangle by growing it
    # until it stops absorbing words to its right (fixed point).
    last_rect = sorted_rects[-1]
    while True:
        words_inside = utils.intersects_bbox(
            [w for w in words if w["x0"] >= last_rect["x0"]],
            (last_rect["x0"], last_rect["top"], last_rect["x1"], last_rect["bottom"]),
        )
        rect = utils.objects_to_rect(words_inside)
        if rect == last_rect:
            break
        last_rect = rect

    def _vertical_edge(x, top, bottom):
        # Zero-width vertical edge at horizontal position `x`.
        return {
            "x0": x,
            "x1": x,
            "top": top,
            "bottom": bottom,
            "height": bottom - top,
            "orientation": "v"
        }

    # Describe all the left-hand edges of each text cluster, plus the
    # far-right boundary of the rightmost cluster.
    return [
        _vertical_edge(b["x0"], b["top"], b["bottom"]) for b in sorted_rects
    ] + [
        _vertical_edge(last_rect["x1"], last_rect["top"], last_rect["bottom"])
    ]
def extract_words(page, x_tolerance=DEFAULT_X_TOLERANCE, y_tolerance=DEFAULT_Y_TOLERANCE, keep_blank_chars=False):
    """
    Group `page.chars` into words.

    Chars are clustered into lines by `doctop` (within `y_tolerance`),
    de-duplicated (exact duplicates, then near-overlapping ones), and
    split into words wherever the horizontal gap between consecutive
    chars exceeds `x_tolerance`.

    :param page: pdfplumber page object (only `.chars` is read)
    :param keep_blank_chars: if False, whitespace chars terminate words
        and are dropped
    :return: list of word dicts with bbox, joined text, and source chars
    """
    x_tolerance = decimalize(x_tolerance)
    y_tolerance = decimalize(y_tolerance)

    def process_word_chars(chars):
        # Collapse a run of char dicts into a single word dict.
        x0, top, x1, bottom = objects_to_bbox(chars)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": "".join(map(itemgetter("text"), chars)),
            "chars": chars
        }

    def make_set_clusters(doctop_cluster):
        # Remove exact-duplicate chars within each line cluster.
        # JSON-serializing each char gives a hashable dedup key; an
        # insertion-ordered dict keeps results deterministic (the previous
        # set()-based version made tie ordering arbitrary run-to-run).
        new_clusters = []
        for cluster in doctop_cluster:
            unique = dict.fromkeys(simplejson.dumps(c) for c in cluster)
            restored = []
            for serialized in unique:
                d = simplejson.loads(serialized)
                # The JSON round-trip turns Decimal coordinates into
                # floats; restore them so downstream arithmetic matches.
                for k, v in d.items():
                    if isinstance(v, float):
                        d[k] = Decimal(str(v))
                restored.append(d)
            new_clusters.append(restored)
        return new_clusters

    def check_two_chars(char1, char2):
        # Chars whose x0 differ by less than 1 are treated as overlapping
        # duplicates (e.g. doubled glyphs from bold emulation).
        return abs(char1['x0'] - char2['x0']) >= 1

    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
        get_text = itemgetter("text")
        chars_sorted = sorted(chars, key=itemgetter("x0"))
        # Drop near-overlapping duplicates that survived exact dedup;
        # each char is compared against its predecessor in sorted order.
        deduped = [
            char for i, char in enumerate(chars_sorted)
            if i == 0 or check_two_chars(char, chars_sorted[i - 1])
        ]
        words = []
        current_word = []
        for char in deduped:
            if not keep_blank_chars and get_text(char).isspace():
                # Whitespace terminates the current word and is dropped.
                if current_word:
                    words.append(current_word)
                    current_word = []
            elif current_word and char["x0"] > (current_word[-1]["x1"] + tolerance):
                # Gap wider than the tolerance starts a new word.
                words.append(current_word)
                current_word = [char]
            else:
                current_word.append(char)
        if current_word:
            words.append(current_word)
        return [process_word_chars(w) for w in words]

    chars = to_list(page.chars)
    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
    doctop_clusters = make_set_clusters(doctop_clusters)
    nested = [
        get_line_words(line_chars, tolerance=x_tolerance)
        for line_chars in doctop_clusters
    ]
    return list(itertools.chain(*nested))
def fetch(self) -> Iterator[Journal]:
    """
    Download the PDF at `self.url`, extract entry texts from it, and
    yield Journal objects.

    NOTE(review): depends on helpers not visible here (cache_in_memory,
    cmap_char, make_combining_form, cid_regex, PDFChar, Journal) —
    their exact contracts are assumed, not verified.
    """
    resp = requests.get(self.url)
    resp.raise_for_status()
    # Content-Length, when present, lets cache_in_memory size its buffer.
    content_length = resp.headers.get("Content-Length", None)
    content_length = int(
        content_length) if content_length is not None else None
    cached_file = cache_in_memory(resp, size=content_length)
    with pdfplumber.open(cached_file) as pdf:

        def get_entries() -> Iterator[str]:
            # Yield one text blob per journal entry; entries are delimited
            # by bold "heading" lines within each of two page columns.
            from pdfplumber.utils import cluster_objects, extract_text, DEFAULT_X_TOLERANCE
            import unicodedata
            # fontname_regex = re.compile(r"([A-Z]{6})\+([A-Za-z]+)(\d+)?")
            small_font_size_threshold = Decimal("8.0")

            def is_font_bold(char: PDFChar) -> bool:
                # Fontnames look like "ABCDEF+BaseName"; "BX" in the base
                # name marks the bold face here — TODO confirm for all fonts.
                tag, fontname = char["fontname"].split("+")
                return "BX" in fontname

            def is_font_small(char: PDFChar) -> bool:
                return char["size"] < small_font_size_threshold

            def normalize_char(
                    char: PDFChar,
                    interpreter: PDFPageInterpreter) -> Optional[PDFChar]:
                # Resolve "(cid:N)"-style placeholders through the font's
                # cmap, then NFKC-normalize and prefer combining forms.
                # Mutates and returns `char`.
                text = char["text"]
                if len(text) > 1 and (
                        cid_match := cid_regex.fullmatch(text)) is not None:
                    cid = int(cid_match.group(1))
                    text = cmap_char(cid, char["fontname"], interpreter)
                    if text is None:
                        # Unmappable glyph: keep the char but blank its text.
                        char["text"] = None
                        return char
                ntext = unicodedata.normalize("NFKC", text)
                if len(ntext) == 2 and unicodedata.combining(ntext[1]):
                    # NFKC split a precomposed char; keep the combining mark.
                    text = ntext[1]
                text = make_combining_form(text) or text
                if is_font_small(char):
                    # Presumably small "o" glyphs are degree signs in this
                    # layout — verify against source PDFs.
                    if text == "o":
                        text = "°"
                char["text"] = text
                return char

            def sort_line_chars(
                    chars: Sequence[PDFChar],
                    interpreter: PDFPageInterpreter) -> Sequence[PDFChar]:
                # Yield main chars left-to-right, interleaving after each one
                # the combining marks that overlap it horizontally by >= 50%
                # of the main char's width.
                chars = (normalize_char(char, interpreter) for char in chars)
                chars = sorted(chars, key=lambda char: char["x0"])
                main_chars, combining_chars = partition(
                    lambda char: char["text"] and unicodedata.combining(
                        char["text"]), chars)
                combining_chars_iter = peekable(iter(combining_chars))
                for main_char in main_chars:
                    yield main_char
                    while combining_chars_iter:
                        combining_char = combining_chars_iter.peek()
                        overlap = max(
                            min(main_char["x1"], combining_char["x1"]) -
                            max(main_char["x0"], combining_char["x0"]), 0)
                        if overlap < main_char["width"] * Decimal("0.5"):
                            break
                        yield combining_char
                        next(combining_chars_iter, None)
                # Every combining mark should have attached to some base char.
                assert (next(combining_chars_iter, None) is None)
                return
                yield  # unreachable; no-op

            x_tolerance = Decimal("3.0")
            y_tolerance = Decimal("3.0")
            min_tab_width = Decimal("8.0")
            for page in pdf.pages:
                # Re-run the page through a pdfminer interpreter (used by
                # cmap_char for cid lookups — TODO confirm).
                device = PDFPageAggregator(
                    pdf.rsrcmgr,
                    pageno=page.page_number,
                    laparams=pdf.laparams,
                )
                interpreter = PDFPageInterpreter(pdf.rsrcmgr, device)
                interpreter.process_page(page.page_obj)
                # Trim page margins; page 1 gets an extra 200pt top crop
                # (presumably a title header).
                contents = page.crop(
                    (
                        Decimal(100),
                        Decimal(70 + (200 if page.page_number == 1 else 0)),
                        page.width - Decimal(100),
                        page.height - Decimal(70),
                    ),
                    relative=False,
                )
                # Split the cropped area into two equal-width columns.
                left_column = contents.crop(
                    (
                        Decimal(0),
                        Decimal(0),
                        contents.width * Decimal(0.5),
                        contents.height,
                    ),
                    relative=True,
                )
                right_column = contents.crop(
                    (
                        contents.width * Decimal(0.5),
                        Decimal(0),
                        contents.width,
                        contents.height,
                    ),
                    relative=True,
                )
                for column in (left_column, right_column):
                    # Tops of bold lines are the entry separators; the
                    # column's bottom bound closes the last entry.
                    bold_chars = filter(is_font_bold, column.chars)
                    bold_char_lines = cluster_objects(
                        bold_chars, "top", y_tolerance)
                    bold_line_y0s = (min(char["top"] for char in line)
                                     for line in bold_char_lines)
                    hsep_y0s = chain(bold_line_y0s, (column.bbox[3], ))
                    hsep_y0s = list(hsep_y0s)
                    # Each consecutive separator pair bounds one entry.
                    for y0, y1 in windowed(hsep_y0s, 2):
                        if y1 is None:
                            break
                        entry = column.within_bbox(
                            (
                                column.bbox[0],
                                max(y0 - y_tolerance, column.bbox[1]),
                                column.bbox[2],
                                min(y1 + y_tolerance, column.bbox[3]),
                            ),
                            relative=False,
                        )
                        entry_lines = cluster_objects(
                            entry.chars, "top", y_tolerance)
                        entry_text = StringIO()
                        # TODO: refactor into separate top-level function, along with sort_line_chars, normalize_char.
                        for line_chars in entry_lines:
                            line_chars = list(line_chars)
                            last_char: Optional[PDFChar] = None
                            for char in sort_line_chars(
                                    line_chars, interpreter):
                                if last_char is not None and last_char[
                                        "text"] is not None:
                                    # Wide gaps become tabs, narrower
                                    # ones a single space.
                                    if char["x0"] > last_char[
                                            "x1"] + min_tab_width:
                                        entry_text.write("\t")
                                    elif char["x0"] > last_char[
                                            "x1"] + x_tolerance:
                                        entry_text.write(" ")
                                if char["text"] is not None:
                                    entry_text.write(char["text"])
                                    # Combining marks don't advance the
                                    # gap reference point.
                                    if not unicodedata.combining(
                                            char["text"]):
                                        last_char = char
                            entry_text.write("\n")
                        yield unicodedata.normalize(
                            "NFKC", entry_text.getvalue())
            return
            yield  # unreachable; no-op

        # NOTE(review): `journal` is never populated from the entries yet —
        # the parsing loop below only prints; looks like work-in-progress.
        journal = Journal()
        for entry in get_entries():
            print(f"ENTRY: {entry}")
            # TODO: handle `[name in other language]` bits.
            pass
        if journal is not None and journal.names and journal.iso4:
            yield journal
def combine(obj, attr, x_tolerance=3, y_tolerance=3, keep_blank_chars=False):
    """
    General combine function: chars --> words, or words --> combined_words.

    :param obj: char list or words list, in pdfplumber format
    :param attr: "chars" or "words", denoting what to combine
    :return: combined_objs
    """
    def build_obj(members):
        # Merge a run of members into one object, joining text with spaces.
        x0, top, x1, bottom = objects_to_bbox(members)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": " ".join(map(itemgetter("text"), members))
        }

    def split_line(members, tolerance=3):
        # Split one line of members into runs separated by whitespace
        # (unless keep_blank_chars) or by horizontal gaps > tolerance.
        text_of = itemgetter("text")
        ordered = sorted(members, key=itemgetter("x0"))
        runs = []
        run = []
        for item in ordered:
            if not keep_blank_chars and text_of(item).isspace():
                if run:
                    runs.append(run)
                    run = []
            elif run and item["x0"] > (run[-1]["x1"] + tolerance):
                runs.append(run)
                run = [item]
            else:
                run.append(item)
        if run:
            runs.append(run)
        return [build_obj(r) for r in runs]

    # cluster_objects needs a different key for combining chars vs. words.
    cluster_key = {"chars": "doctop", "words": "top"}.get(attr, attr)
    clusters = cluster_objects(obj, cluster_key, y_tolerance)
    per_line = (split_line(line, tolerance=x_tolerance) for line in clusters)
    return list(itertools.chain(*per_line))