def update_preview(self):
    """Refresh the preview label with the current page or a placeholder.

    When ``self.pages`` is ``None`` a rich-text hint (logo plus instructions)
    is shown at a fixed 200x200 size. Otherwise the stream stored at
    ``self.pages[self.current_page - 1]`` is opened as a PDF, its first page
    is rendered to a pixmap, and the label is resized to the parent's size
    multiplied by ``self.zoom_factor``.
    """
    if self.pages is None:
        self.preview_box.setTextFormat(Qt.RichText)
        self.preview_box.setStyleSheet("QLabel { color : darkgray; }")
        # NOTE(review): there is a stray ")" after the src attribute's closing
        # quote in this markup; left untouched, confirm whether it is intended.
        self.preview_box.setText(
            f'<center><img src="{appctxt.get_resource("logo.png")}") /><br />Wähle auf der rechten Seite<br />eine Datei und ein Papierformat<br />aus um die Vorschau anzuzeigen</center>'
        )
        self.preview_box.resize(QSize(200, 200))
        return

    pdf_file = self.pages[self.current_page - 1]
    # Rewind first: the stream may already have been read.
    pdf_file.seek(0)
    container_size = self.preview_box.parent().size()
    image = QPixmap()

    document = Document(stream=pdf_file, filetype="PDF")
    try:
        page = document.loadPage(0)
        # NOTE(review): this is page size / container size; confirm that the
        # inverse ratio was not intended for the render resolution.
        normalized_zoom_factor = min(
            page.rect.height / container_size.height(),
            page.rect.width / container_size.width(),
        )
        scale_mat = Matrix(normalized_zoom_factor, normalized_zoom_factor)
        image.loadFromData(page.getPixmap(matrix=scale_mat).getPNGData())
    finally:
        # The PNG bytes are already loaded into the QPixmap, so the document
        # can be released even if rendering failed.
        document.close()

    image_size = container_size * self.zoom_factor
    self.preview_box.setPixmap(image.scaled(image_size, Qt.KeepAspectRatio))
    self.preview_box.resize(image_size)
def _scaled_pixmap(pixmap, width, height):
    """Return ``pixmap`` resized to the requested width and/or height.

    If only one dimension is given, the other is derived from the pixmap's
    aspect ratio. If neither is given, the pixmap is returned unchanged.
    """
    from fitz import Pixmap

    if width and height:
        return Pixmap(pixmap, width, height, None)
    if width:
        return Pixmap(
            pixmap, width, int(pixmap.height / pixmap.width * width), None)
    if height:
        return Pixmap(
            pixmap, int(pixmap.width / pixmap.height * height), height, None)
    return pixmap


def _write_images_as_pdf(image_paths, pdf_path, width, height):
    """Write one page per image in ``image_paths`` and save to ``pdf_path``."""
    from fitz import Document, Pixmap, Rect

    with Document() as doc:
        for image_path in image_paths:
            pixmap = _scaled_pixmap(Pixmap(image_path), width, height)
            rect = Rect(0, 0, pixmap.width, pixmap.height)
            page = doc.newPage(width=pixmap.width, height=pixmap.height)
            page.insertImage(rect, pixmap=pixmap)
        doc.save(pdf_path, deflate=True)


def manga_to_PDF(dir_path, one_file=True, width=None, height=None):
    """Convert the ``*.jpg`` images below ``dir_path`` into PDF file(s).

    The layout searched is ``dir_path/<chapter>/*.jpg``.

    :dir_path: Directory containing the chapter sub-directories. A single
        trailing ``/`` or ``\\`` is stripped.
    :one_file: If true, all images go into one PDF named after ``dir_path``.
        Otherwise one PDF per chapter directory is written, named after the
        chapter. All PDFs are saved inside ``dir_path``.
    :width: Optional target page width in pixels.
    :height: Optional target page height in pixels. When only one of
        ``width``/``height`` is given the aspect ratio is kept.
    :raises ValueError: If ``dir_path`` is not a directory (including an
        empty string).
    """
    # Public glob API: the private ``glob._iglob`` changed its positional
    # signature in Python 3.10, so it cannot be called portably.
    from glob import glob

    # ``endswith`` is also safe for an empty string, unlike ``dir_path[-1]``.
    if dir_path.endswith(('\\', '/')):
        dir_path = dir_path[:-1]
    if not os.path.isdir(dir_path):
        raise ValueError('传入的路径并非文件夹')

    if one_file:
        title = os.path.basename(dir_path)
        # Sorted so the page order does not depend on directory listing order.
        image_paths = sorted(glob(os.path.join(dir_path, "*", "*.jpg")))
        _write_images_as_pdf(
            image_paths, os.path.join(dir_path, title + ".pdf"), width, height)
        return

    chapters = sorted(
        entry for entry in glob(os.path.join(dir_path, "*"))
        if os.path.isdir(entry)
    )
    for chap in chapters:
        title = os.path.basename(chap)
        image_paths = sorted(glob(os.path.join(chap, "*.jpg")))
        _write_images_as_pdf(
            image_paths, os.path.join(dir_path, title + ".pdf"), width, height)
def inspect(file: Union[str, bytes, Path]) -> PdfRedactionsDict:
    """
    Inspect a file for bad redactions and return a Dict with their info

    :file: The PDF to process, as bytes if you have the file in memory (useful
    if it's coming from the network), as a unicode string if you know the
    path to the file on your local disk, or as a pathlib.Path object. A string
    starting with "https://" is downloaded (10 second timeout) and the
    response body is inspected.
    :return: A dict mapping 1-based page numbers to the bad redaction
    information of that page, passed through ``check_if_all_dates``. If no
    bad redactions are found, returns an empty dict.
    """
    if isinstance(file, bytes):
        pdf = Document(stream=file, filetype="pdf")
    elif isinstance(file, str) and file.startswith("https://"):
        r = requests.get(file, timeout=10)
        r.raise_for_status()
        pdf = Document(stream=r.content, filetype="pdf")
    else:
        # str filepath or Pathlib Path
        pdf = Document(file)

    bad_redactions = {}
    try:
        for page_number, page in enumerate(pdf, start=1):
            redactions = get_bad_redactions(page)
            if redactions:
                bad_redactions[page_number] = redactions
    finally:
        # Release the document even when a page fails to process.
        pdf.close()

    return check_if_all_dates(bad_redactions)
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False) -> List[dict]:
    """Collect the results of searching `pattern` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: 1-based page number to search; None searches every page of the
            document, which is highly discouraged
      ign_case: compile the pattern with re.IGNORECASE?
    Returns
      the concatenated results of `search_in_page` for each searched page,
      or an empty list when `page` is out of range
    """
    if page is None:
        targets = doc.pages()
    elif page < 1 or page > doc.pageCount:
        # page out of range
        return []
    else:
        targets = [doc[page - 1]]

    # compiled only after the range check, so an out-of-range page never
    # touches the pattern
    if ign_case:
        regex = re.compile(pattern, re.IGNORECASE)
    else:
        regex = re.compile(pattern)

    # pages are searched one after another; callers are expected to pass
    # a page number, so there is no parallel search
    return [hit for p in targets for hit in search_in_page(regex, p)]
def attact_annotation(annot: Annotation, doc: fitz.Document):
    """Re-create ``annot`` on page ``annot.pageNum`` of ``doc``.

    Free-text, text, highlight, strike-out, squiggly and underline
    annotations are supported. Any other type is reported with ``print`` and
    ignored. After creation, the info dict, colors, border, line ends (when
    present) and opacity of ``annot`` are applied to the new annotation.

    :annot: Source annotation description.
    :doc: Document that receives the new annotation.
    """
    page: fitz.Page = doc.loadPage(annot.pageNum)
    atype = annot.annotType[0]

    if atype == fitz.ANNOT_FREETEXT:
        ann_new = page.addFreetextAnnot(annot.rect, annot.info['content'])
    elif atype == fitz.ANNOT_TEXT:
        ann_new = page.addTextAnnot(annot.point, annot.info['content'])
    elif atype == fitz.ANNOT_HIGHLIGHT:
        ann_new = page.addHighlightAnnot(annot.quads)
    elif atype == fitz.ANNOT_STRIKEOUT:
        # The PyMuPDF method is spelled "addStrikeoutAnnot"; the previous
        # "addStrikeOUTAnnot" spelling is not a Page method.
        ann_new = page.addStrikeoutAnnot(annot.quads)
    elif atype == fitz.ANNOT_SQUIGGLY:
        ann_new = page.addSquigglyAnnot(annot.quads)
    elif atype == fitz.ANNOT_UNDERLINE:
        ann_new = page.addUnderlineAnnot(annot.quads)
    else:
        print(
            f'Annotation type {annot.annotType} is not supported. Ignore: {annot}'
        )
        return

    if not ann_new:
        return

    ann_new.setInfo(annot.info)
    ann_new.setColors(annot.colors)
    ann_new.setBorder(annot.border)
    if annot.lineEnds:
        ann_new.setLineEnds(*annot.lineEnds)
    ann_new.setOpacity(annot.opacity)
    # NOTE(review): PyMuPDF may need ann_new.update() before these property
    # changes show in the rendered appearance; confirm against the version used.
def extract_info_from_pdf(doc: fitz.Document, user: str) -> [Train]:
    """Run ``extract_info_from_text`` on the plain text of every page.

    Pages are visited in order; results that are ``None`` are dropped.

    :doc: Document whose pages are read.
    :user: Passed through to ``extract_info_from_text``.
    :return: List of the non-``None`` results, in page order.
    """
    # Both generators are lazy, so each page is loaded, read and parsed
    # before the next one is touched.
    page_texts = (
        doc.loadPage(index).getText("text") for index in range(doc.pageCount)
    )
    parsed = (extract_info_from_text(content, user) for content in page_texts)
    return [item for item in parsed if item is not None]
def read_book(file):
    """
    Open a book file as a Document.

    :file: must be a string naming a .pdf, .epub or .fb2 file.
    :return: the opened Document.
    :raises ValueError: if the name does not end with one of the supported
    extensions.
    """
    if not file.endswith(('.pdf', '.epub', '.fb2')):
        # Previously this fell through and returned None, contradicting the
        # documented contract that other formats raise.
        raise ValueError("Unsupported book format: %r" % (file,))
    return Document(file)
def save_transform_pdf(
    case_id: str,
    pdf_file: FileStorage,
):
    """Save an uploaded PDF and render its image objects to PNG files.

    The upload is stored under ``parent_dir`` with a random 8-digit name.
    Every xref object whose source matches both ``checkXO`` and ``checkIM``
    is queued as ``(pdf_path, xref, png_path)``. 64 worker processes running
    ``render`` then drain the queue. The elapsed time is printed at the end.

    :case_id: Identifier used to derive the output directory.
    :pdf_file: Uploaded file; only its ``save`` method is used here.
    """
    started_at = time.time()
    case_dir_path = __case_dir_path(case_id)
    os.makedirs(case_dir_path, exist_ok=True)

    # Integer bound: random.randint rejects float arguments such as 1e7 on
    # current Python versions.
    pdf_path = "%s%08d.pdf" % (parent_dir, rd.randint(0, 10 ** 7))
    pdf_file.save(pdf_path)

    img_cnt = 0
    q = Queue()
    pdf = Document(pdf_path)
    try:
        refs_size = pdf._getXrefLength()
        # Walk over every object in the xref table.
        for i in range(1, refs_size):
            # Source text of the object definition.
            text = pdf._getXrefString(i)
            is_object = re.search(checkXO, text)
            # Use the regular expression to see whether it is an image.
            is_img = re.search(checkIM, text)
            # Only objects matching both patterns are rendered.
            if not is_object or not is_img:
                continue
            img_cnt += 1
            path = '%s%08d.png' % (case_dir_path, img_cnt)
            # Queue the image for rendering, addressed by its xref number.
            q.put((pdf_path, i, path))
    finally:
        # Workers open the file themselves from pdf_path, so this handle is
        # no longer needed.
        pdf.close()

    procs = [
        Process(target=render, name="P%d" % _pi, args=(q, ))
        for _pi in range(64)
    ]
    for proc in procs:
        print('start', proc.name)
        proc.start()
    for proc in procs:
        proc.join()
    print("total time: %.4f" % (time.time() - started_at))
def _is_device_cs(xref, doc: fitz.Document):
    '''Tell whether the color space defined by object `xref` is device based.

    Only /ICCBased color spaces are examined; every other definition yields False.
    '''
    definition = doc.xrefObject(xref)

    # all other color spaces are ignored for now, may be added when
    # facing associated cases
    if '/ICCBased' not in definition:
        return False

    # An /ICCBased cs counts as device based when its stream dictionary names
    # /Device[Gray|RGB|CMYK] as /Alternate.
    #
    # cs definition:
    #   [ /ICCBased 15 0 R ]
    #
    # referenced object:
    #   <<
    #     /Alternate /DeviceRGB
    #     /Filter /FlateDecode
    #     /Length 2597
    #     /N 3
    #   >>
    tokens = definition[1:-1].strip().split()
    _kind, icc_xref, *_rest = tokens
    icc_definition = doc.xrefObject(int(icc_xref))
    return '/Alternate /Device' in icc_definition
def return_page(doc: Document, page_num=0):
    """
    Load a single page of the document.

    :doc: Document with all pages.
    :page_num: 0-based number of the document page, default value = 0.
    :return: the page object loaded from the document.
    :raises Exception: with the message "Wrong page" when page_num is
    negative or not smaller than doc.page_count.
    """
    if page_num < 0 or page_num >= doc.page_count:
        raise Exception("Wrong page")
    # snake_case name, matching the page_count attribute used above; the
    # camelCase loadPage spelling is the deprecated alias.
    return doc.load_page(page_num)
def show_page(doc: Document, page_num=0):
    """
    Render a single page of the document to a QImage.

    :doc: Document with all pages.
    :page_num: 0-based number of the document page, default value = 0.
    :return: QImage in RGBA8888 format when the rendered pixmap has an alpha
    channel, RGB888 otherwise.
    :raises Exception: with the message "Wrong page" when page_num is
    negative or not smaller than doc.page_count.
    """
    if page_num < 0 or page_num >= doc.page_count:
        raise Exception("Wrong page")
    # snake_case name, matching page_count / get_pixmap used in this function.
    pix = doc.load_page(page_num).get_pixmap()
    fmt = QImage.Format_RGBA8888 if pix.alpha else QImage.Format_RGB888
    qtimg = QImage(pix.samples, pix.width, pix.height, pix.stride, fmt)
    # A QImage built from an external buffer does not own the pixel data, and
    # `pix` goes away when this function returns. Hand back a deep copy.
    return qtimg.copy()
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Collect toc entries from every text block of a document

    Arguments
      doc: a pdf document
      recipe: recipe from user; its `extract_block` is called with each
              block and the 1-based page number
    Returns
      a list of toc entries in the document
    """
    entries = []
    for page in doc.pages():
        page_dict = page.getTextPage().extractDICT()
        # page.number is 0-based, the recipe receives 1-based numbers
        page_no = page.number + 1
        for block in page_dict.get('blocks', []):
            entries.extend(recipe.extract_block(block, page_no))
    return entries
def render(q: Queue):
    """Worker loop: render queued PDF image objects to PNG files.

    Each queue item is a ``(pdf_path, xref, save_to)`` tuple. The document
    for ``pdf_path`` is opened once and reused while consecutive items refer
    to the same path. The worker returns once no item arrives within a short
    timeout.

    :q: Queue of render jobs, shared with the other workers.
    """
    from queue import Empty

    opened_path = None
    opened_doc = None
    try:
        while True:
            # A separate empty() check followed by a blocking get() can hang
            # forever when another worker takes the last item in between, so
            # fetch with a timeout instead.
            try:
                pdf_path, xref, save_to = q.get(timeout=0.1)
            except Empty:
                break

            if pdf_path != opened_path:
                print('Worker Read')
                if opened_doc is not None:
                    opened_doc.close()
                opened_doc = Document(pdf_path)
                opened_path = pdf_path

            pix = fitz.Pixmap(opened_doc, xref)
            if pix.n < 5:
                pix.writePNG(save_to)
            else:
                # Otherwise convert CMYK to RGB first.
                fitz.Pixmap(fitz.csRGB, pix).writePNG(save_to)
    finally:
        if opened_doc is not None:
            opened_doc.close()
def _check_device_cs(doc:fitz.Document, page:fitz.Page):
    '''Map each color space name used by the page to whether it is device based.

    The three /Device* names are always present and True; names listed in the
    page's /ColorSpace block are added with the result of `_is_device_cs`.
    '''
    # default device based cs
    cs = {
        '/DeviceGray': True,
        '/DeviceRGB' : True,
        '/DeviceCMYK': True
    }

    # content of page object, e.g.
    # <<
    # ...
    # /Resources <<
    #     ...
    #     /ColorSpace <<
    #     /Cs6 14 0 R
    #     >>
    # >>
    # /Rotate 0
    # /Type /Page
    # >>
    inside_block = False
    for raw_line in doc.xrefObject(page.xref).splitlines():
        entry = raw_line.strip()

        if not inside_block:
            # still looking for the line that opens the color space block;
            # that line itself carries no entry
            inside_block = entry.startswith('/ColorSpace')
            continue

        # end of color space block
        if entry == '>>':
            break

        # now within cs block, e.g. /Cs6 14 0 R
        cs_name, xref, *_ = entry.split()
        cs[cs_name] = _is_device_cs(int(xref), doc)

    return cs
def test_reading_books_test1(self):
    """read_book must report the same page count as opening the file directly."""
    book_path = "c++_trofumenko.pdf"
    opened_directly = Document(book_path)
    opened_by_reader = read_book(book_path)
    self.assertEqual(opened_by_reader.page_count, opened_directly.page_count)
def write_toc(doc: Document, toc: List[ToCEntry]):
    """Store the given entries as the document's table of contents"""
    # each entry converts itself via to_fitz_entry() before being handed
    # to set_toc
    doc.set_toc([entry.to_fitz_entry() for entry in toc])
def read_toc(doc: Document) -> List[ToCEntry]:
    """Build a ToCEntry from each item returned by the document's get_toc()"""
    entries = []
    for raw_entry in doc.get_toc():
        # the items of each raw entry become positional arguments
        entries.append(ToCEntry(*raw_entry))
    return entries
page_delta = args.delta[0] + 1 else: page_delta = 1 if args.start: start = args.start[0] else: start = 0 if args.end: end = args.end[0] results_file = args.output[0] font_standard = args.font_size[0] pdf_file = args.pdf_file[0] names_file = args.names[0] with open(names_file, "r") as file: names_list = tuple(map(lambda x: str(x).strip("\n"), file.readlines())) doc = Document(pdf_file) if end is None: end = len(doc) results = open(results_file, "a") for name in names_list: print(f"Parsing: {name}") word_in_pages = [] results.write(f"{name} ") for page in range(start, end): lines = unescape(doc[page].getText("html")).split("\n") for line in lines: item = line.replace("\t", " ") bs = BeautifulSoup(item, "html.parser") if bs.p is not None: if name.title() in str(bs.p.text) and str( bs.p["style"]).find("top:84pt") == -1: