def __prepare_display(self):
    """Turn the page's raw rects, curves and text lines into display items.

    The method works in successive passes, each one consuming part of the
    remaining text lines:

    1. Rects from ``self.ltRects`` are split into thick frames (wrapped in
       a ``SingleCellTable``) and thin ruling lines.  Ruling lines are
       clustered; clusters of four or more rects are tried as a
       ``pdftable.Table``.  Rects that end up in no table are converted to
       ``pdftable.Curve`` objects and join ``self.curves``.
    2. Text lines contained in a frame/table are placed into the matching
       cell ("explicit tables").
    3. Text following an "... exceptions" or "fpu flags affected" heading
       is gathered into ``SingleCellTable`` objects ("exception tables").
    4. A single-cell table contained in a bigger table is nested into it
       and both are treated as figures; everything else stays a table.
    5. Curves are attached to the top-level figure that contains them.
    6. Tables holding at most one item are discarded (a single item is
       handed back to the free text).
    7. Bullet-list lines are separated from the free text.

    Returns:
        A list made of the merged free text, the surviving top-level
        tables and the top-level figures, sorted with
        ``sort_topdown_ltr``.
    """
    heading_font = "NeoSansIntelMedium"

    # --- frames versus ruling lines -------------------------------------
    frames = []
    lines = []
    for rect in self.ltRects:
        is_thick = (rect.horizontal() and rect.height() > 8) or (
            rect.vertical() and rect.width() > 8)
        if is_thick:
            table = SingleCellTable([])
            table.rect = rect
            frames.append(table)
        else:
            lines.append(rect)

    # NOTE(review): this loop only terminates if cluster_rects() removes
    # the rects it returns from `lines` -- confirm against pdftable.
    orphans = []
    while lines:
        cluster = pdftable.cluster_rects(lines)
        if len(cluster) >= 4:
            try:
                frames.append(pdftable.Table(cluster))
            except Exception:
                # Best effort: a cluster that cannot be built into a table
                # is kept as loose rects below.  Only Exception is caught
                # so that KeyboardInterrupt/SystemExit still propagate.
                pass
            else:
                continue
        orphans += cluster

    curves = sorted(
        self.curves + [pdftable.Curve(o.points()) for o in orphans],
        key=cmp_to_key(sort_topdown_ltr))
    textLines = sorted(self.textLines, key=cmp_to_key(sort_topdown_ltr))

    # --- explicit tables ------------------------------------------------
    tables = []
    for table in frames:
        orphans = []
        bounds = table.bounds()
        for i, line in enumerate(textLines):
            if bounds.contains(line.bounds()):
                # Some pages have their "NOTES" section embedded inside
                # the table rectangle: from that heading on, every
                # remaining line is treated as being outside the table.
                if (line.font_name() == heading_font
                        and str(line).lower().startswith("notes")):
                    orphans += textLines[i:]
                    break
                table.get_at_pixel(
                    line.rect.xmid(), line.rect.ymid()).append(line)
            else:
                orphans.append(line)
        # Only the lines not consumed by this table are offered to the
        # next one.
        textLines = orphans
        tables.append(table)

    # --- exception tables -----------------------------------------------
    orphans = []
    table_data = []
    is_table_section = False
    expected_format = None
    for line in textLines:
        if line.font_name() == heading_font:
            # Headings always stay in the free text.
            orphans.append(line)
            title = str(line).strip().lower()
            if title.endswith("exceptions"):
                is_table_section = True
                expected_format = exceptions_format__
            elif title == "fpu flags affected":
                is_table_section = True
                expected_format = fpu_flags_format__

            # A heading closes the table that was being accumulated.
            if is_table_section and table_data:
                tables.append(SingleCellTable(table_data))
                table_data = []
            continue

        if not is_table_section:
            orphans.append(line)
        elif line.bounds().x1() > 50:
            table_data.append(line)
        elif expected_format.search(str(line)) is None:
            # A line at x1 <= 50 that does not match the expected row
            # format ends the current table and goes back to free text.
            orphans.append(line)
            if table_data:
                tables.append(SingleCellTable(table_data))
                table_data = []
        else:
            table_data.append(line)

    if table_data:
        tables.append(SingleCellTable(table_data))

    # --- tables versus figures (versus useless frames) ------------------
    # Sorted by area so that each candidate only has to look at the
    # entries after it to find a container.
    all_tables = sorted(tables, key=lambda x: x.bounds().area())
    tables = set()
    figures = set()
    sublevel_figures = set()
    for i, smaller in enumerate(all_tables):
        if smaller.rows() != 1 or smaller.columns() != 1:
            tables.add(smaller)
            continue

        smaller_bounds = smaller.bounds()
        for bigger in all_tables[i + 1:]:
            if bigger.bounds().contains(smaller_bounds):
                bigger.get_at_pixel(
                    smaller_bounds.xmid(),
                    smaller_bounds.ymid()).append(smaller)
                figures.add(bigger)
                figures.add(smaller)
                sublevel_figures.add(smaller)
                break
        else:
            # No container found: a stand-alone single-cell table.
            tables.add(smaller)

    top_figures = [Figure(t) for t in figures - sublevel_figures]
    top_tables = list(tables - figures)

    # --- attach curves to figures ---------------------------------------
    # Each curve goes to the first figure containing it; the others are
    # passed on to the next figure.
    for figure in top_figures:
        orphanCurves = []
        for curve in curves:
            if figure.bounds().contains(curve.bounds()):
                figure.data.get_at(0, 0).append(curve)
            else:
                orphanCurves.append(curve)
        curves = orphanCurves

    # --- drop tables that hold at most one item -------------------------
    kept_tables = []
    for table in top_tables:
        count = table.item_count()
        if count > 1:
            kept_tables.append(table)
        elif count == 1:
            # The single item is handed back to the free text.
            orphans += table.get_at(0, 0)
    top_tables = kept_tables

    # --- lists ------------------------------------------------------------
    textLines = self.__merge_text(orphans)
    orphans = []
    # NOTE(review): `lists` is filled but never added to the returned
    # value, and a list still open when the loop ends is not flushed, so
    # bullet items are currently left out of the result -- confirm whether
    # that is intended.
    lists = []
    this_list = []
    i = 0
    while i < len(textLines):
        line = textLines[i]
        if line.chars[0].get_text() == "•":
            if len(line.chars) == 1:
                # The bullet stands alone: the item text is the next
                # line.  A bullet on the very last line has no successor
                # and is kept as is rather than indexing past the end.
                if i + 1 < len(textLines):
                    i += 1
                    line = textLines[i]
            else:
                # Strip the bullet and the whitespace following it.  If
                # only whitespace follows, j ends on the last index and
                # that last character is kept.
                for j in range(1, len(line.chars)):
                    if not line.chars[j].get_text().isspace():
                        break
                line.chars = line.chars[j:]
            this_list.append(line)
        else:
            if this_list:
                lists.append(pdftable.List(this_list))
                this_list = []
            orphans.append(line)
        i += 1

    displayable = self.__merge_text(orphans) + top_tables + top_figures
    displayable.sort(key=cmp_to_key(sort_topdown_ltr))
    return displayable
def process_curve(self, curve):
    """Record *curve* on this object after passing its points through
    ``__fix_point``.

    Every point in ``curve.pts`` is converted with ``self.__fix_point``,
    a new ``pdftable.Curve`` is built from the converted points, and that
    curve is appended to ``self.curves``.
    """
    fixed_points = []
    for point in curve.pts:
        fixed_points.append(self.__fix_point(point))
    self.curves.append(pdftable.Curve(fixed_points))