def outlines_caj(self):
    """Extract the TOC from the selected CAJ file and write it into the
    default output PDF as outlines (bookmarks), logging progress to the UI.
    """
    self.log('outlines')
    # Bail out early when the selected input file is missing.
    if self.check_file_exist() is False:
        self.log('--------')
        return
    try:
        parser = CAJParser(self.line_address.text())
        outline = parser.get_toc()
        # Write outlines into a temp PDF, then swap it over the output.
        add_outlines(outline, self.getDefaultOutput(), "tmp.pdf")
        replace("tmp.pdf", self.getDefaultOutput())
        self.log('output: %s' % self.getDefaultOutput())
    except Exception as err:
        self.log('Exception: %s' % err)
    self.log('--------')
def addindex():
    """Merge the TOC of the selected CAJ file into the selected PDF in place.

    Reads the module-level globals ``origincaj`` (source CAJ path) and
    ``originpdf`` (target PDF path) set by the file-picker callbacks, writes
    the outlines to a temp file next to the target, then atomically replaces
    the target. Shows a tkinter message box on success or failure.
    """
    global originpdf
    global origincaj
    try:
        cajadd = CAJParser(origincaj)
        toc = cajadd.get_toc()
        # Fix: os.path.join instead of dirname + '/' — the old concatenation
        # produced '/tmp.pdf' (filesystem root!) when originpdf had no
        # directory component, and used the wrong separator on Windows.
        tmp = os.path.join(os.path.dirname(originpdf), 'tmp.pdf')
        print(tmp)
        add_outlines(toc, originpdf, tmp)
        os.replace(tmp, originpdf)
        messagebox.showinfo('提示', '完成')
    except Exception:
        # Fix: bare `except:` would also swallow SystemExit/KeyboardInterrupt.
        messagebox.showinfo('错误', '请检查是否选中caj或pdf')
def addindex():
    """Merge the TOC of the selected CAJ file into the selected PDF in place.

    Duplicate definition of ``addindex`` (same name appears earlier in this
    file; this later definition is the one that takes effect at import time).
    Uses globals ``origincaj``/``originpdf`` set by the file-picker callbacks.
    """
    global originpdf
    global origincaj
    try:
        cajadd = CAJParser(origincaj)
        toc = cajadd.get_toc()
        # Fix: os.path.join instead of dirname + '/' — the old concatenation
        # produced '/tmp.pdf' (filesystem root!) when originpdf had no
        # directory component, and used the wrong separator on Windows.
        tmp = os.path.join(os.path.dirname(originpdf), 'tmp.pdf')
        print(tmp)
        add_outlines(toc, originpdf, tmp)
        os.replace(tmp, originpdf)
        messagebox.showinfo('提示', '完成')
    except Exception:
        # Fix: bare `except:` would also swallow SystemExit/KeyboardInterrupt.
        messagebox.showinfo('错误', '请检查是否选中caj或pdf')
def outlines_caj(self):
    """Copy the TOC of the chosen CAJ file into the chosen PDF, in place.

    Validates both selected paths, writes the outlines to ``tmp.pdf`` and
    moves it over the target PDF, logging progress to the UI.
    """
    self.log('outlines')
    # Both inputs must exist before any work is done; bail out otherwise.
    for path, suffix in ((self.caj_address.text(), '.caj'),
                         (self.pdf_address.text(), '.pdf')):
        if self.check_file_exist(path, suffix) is False:
            self.log('--------')
            return
    try:
        parser = CAJParser(self.caj_address.text())
        outline = parser.get_toc()
        add_outlines(outline, self.pdf_address.text(), "tmp.pdf")
        move("tmp.pdf", self.pdf_address.text())
        self.log('output: %s' % self.pdf_address.text())
    except Exception as err:
        self.log('Exception: %s' % err)
    self.log('--------')
def _convert_caj(self, dest):
    """Convert this CAJ file to a PDF written at *dest*.

    Strategy: carve the embedded PDF object stream out of the CAJ container,
    de-duplicate/reorder the objects, synthesize the missing Catalog (and any
    missing Pages objects), then let mutool rebuild the xref table before
    adding the CAJ TOC as PDF outlines.

    Raises SystemExit if mutool fails.
    NOTE(review): invokes "./mutool", so it only works when run from the
    directory containing the mutool binary — confirm intended.
    """
    caj = open(self.filename, "rb")
    # Extract original PDF data (and add header)
    # A pointer stored after the page-number field gives the offset of the
    # embedded PDF; its first 4 bytes give the PDF data start.
    caj.seek(self._PAGE_NUMBER_OFFSET + 4)
    [pdf_start_pointer] = struct.unpack("i", caj.read(4))
    caj.seek(pdf_start_pointer)
    [pdf_start] = struct.unpack("i", caj.read(4))
    # The PDF body ends 6 bytes past the last "endobj" marker.
    pdf_end = fnd_all(caj, b"endobj")[-1] + 6
    pdf_length = pdf_end - pdf_start
    caj.seek(pdf_start)
    pdf_data = b"%PDF-1.3\r\n" + caj.read(pdf_length) + b"\r\n"
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # deal with disordered PDF data
    # Re-emit each indirect object exactly once, in scan order, skipping
    # duplicate object numbers.
    endobj_addr = fnd_all(pdf, b"endobj")
    pdf_data = b"%PDF-1.3\r\n"
    obj_no = []
    for addr in endobj_addr:
        # Walk back from "endobj" to the start of the "<no> 0 obj" header.
        startobj = fnd_rvrs(pdf, b" 0 obj", addr)
        startobj1 = fnd_rvrs(pdf, b"\r", startobj)
        startobj2 = fnd_rvrs(pdf, b"\n", startobj)
        startobj = max(startobj1, startobj2)
        length = fnd(pdf, b" ", startobj) - startobj
        pdf.seek(startobj)
        [no] = struct.unpack(str(length) + "s", pdf.read(length))
        if int(no) not in obj_no:
            obj_no.append(int(no))
            obj_len = addr - startobj + 6
            pdf.seek(startobj)
            [obj] = struct.unpack(str(obj_len) + "s", pdf.read(obj_len))
            pdf_data += (b"\r" + obj)
    pdf_data += b"\r\n"
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Catalog (find obj_no of pages)
    # Every "/Parent <n> 0 R" reference names a Pages object candidate.
    inds_addr = [i + 8 for i in fnd_all(pdf, b"/Parent")]
    inds = []
    for addr in inds_addr:
        length = fnd(pdf, b" ", addr) - addr
        pdf.seek(addr)
        [ind] = struct.unpack(str(length) + "s", pdf.read(length))
        inds.append(int(ind))
    # get pages_obj_no list containing distinct elements
    # & find missing pages object(s) -- top pages object(s) in pages_obj_no
    pages_obj_no = []
    top_pages_obj_no = []
    for ind in inds:
        if (ind not in pages_obj_no) and (ind not in top_pages_obj_no):
            # A referenced object with no "<n> 0 obj" definition is a
            # missing (top-level) Pages object that must be synthesized.
            if fnd(pdf, bytes("\r{0} 0 obj".format(ind), "utf-8")) == -1:
                top_pages_obj_no.append(ind)
            else:
                pages_obj_no.append(ind)
    single_pages_obj_missed = len(top_pages_obj_no) == 1
    multi_pages_obj_missed = len(top_pages_obj_no) > 1
    # generate catalog object
    catalog_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
    obj_no.append(catalog_obj_no)
    root_pages_obj_no = None
    if multi_pages_obj_missed:
        # Several roots are missing: allocate a fresh number for a new
        # umbrella root Pages object.
        root_pages_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
    elif single_pages_obj_missed:
        # Exactly one root missing: it becomes the root; existing Pages
        # objects become its kids.
        root_pages_obj_no = top_pages_obj_no[0]
        top_pages_obj_no = pages_obj_no
    else:
        # root pages object exists, then find the root pages object
        # (the Pages object whose dictionary has no /Parent before endobj).
        found = False
        for pon in pages_obj_no:
            tmp_addr = fnd(pdf, bytes("\r{0} 0 obj".format(pon), 'utf-8'))
            while True:
                pdf.seek(tmp_addr)
                [_str] = struct.unpack("6s", pdf.read(6))
                if _str == b"Parent":
                    break
                elif _str == b"endobj":
                    root_pages_obj_no = pon
                    found = True
                    break
                tmp_addr = tmp_addr + 1
            if found:
                break
    catalog = bytes(
        "{0} 0 obj\r<</Type /Catalog\r/Pages {1} 0 R\r>>\rendobj\r".format(
            catalog_obj_no, root_pages_obj_no), "utf-8")
    pdf_data += catalog
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Pages obj and EOF mark
    # if root pages object exist, pass
    # deal with single missing pages object
    if single_pages_obj_missed or multi_pages_obj_missed:
        inds_str = ["{0} 0 R".format(i) for i in top_pages_obj_no]
        kids_str = "[{0}]".format(" ".join(inds_str))
        pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
            root_pages_obj_no, kids_str, self.page_num)
        pdf_data += bytes(pages_str, "utf-8")
        with open("pdf.tmp", 'wb') as f:
            f.write(pdf_data)
        pdf = open("pdf.tmp", "rb")
    # deal with multiple missing pages objects
    if multi_pages_obj_missed:
        # Synthesize each missing Pages object: collect its kids and a
        # /Count that sums child page counts.
        kids_dict = {i: [] for i in top_pages_obj_no}
        count_dict = {i: 0 for i in top_pages_obj_no}
        for tpon in top_pages_obj_no:
            kids_addr = fnd_all(
                pdf, bytes("/Parent {0} 0 R".format(tpon), "utf-8"))
            for kid in kids_addr:
                # Walk back from the /Parent reference to the kid's own
                # "<n> 0 obj" header to recover its object number.
                ind = fnd_rvrs(pdf, b"obj", kid) - 4
                addr = fnd_rvrs(pdf, b"\r", ind)
                length = fnd(pdf, b" ", addr) - addr
                pdf.seek(addr)
                [ind] = struct.unpack(str(length) + "s", pdf.read(length))
                kids_dict[tpon].append(int(ind))
                type_addr = fnd(pdf, b"/Type", addr) + 5
                tmp_addr = fnd(pdf, b"/", type_addr) + 1
                pdf.seek(tmp_addr)
                [_type] = struct.unpack("5s", pdf.read(5))
                if _type == b"Pages":
                    # Intermediate Pages node: add its declared /Count.
                    cnt_addr = fnd(pdf, b"/Count ", addr) + 7
                    pdf.seek(cnt_addr)
                    [_str] = struct.unpack("1s", pdf.read(1))
                    cnt_len = 0
                    # Scan digits until a delimiter ends the number.
                    while _str not in [b" ", b"\r", b"/"]:
                        cnt_len += 1
                        pdf.seek(cnt_addr + cnt_len)
                        [_str] = struct.unpack("1s", pdf.read(1))
                    pdf.seek(cnt_addr)
                    [cnt] = struct.unpack(
                        str(cnt_len) + "s", pdf.read(cnt_len))
                    count_dict[tpon] += int(cnt)
                else:  # _type == b"Page"
                    count_dict[tpon] += 1
            kids_no_str = ["{0} 0 R".format(i) for i in kids_dict[tpon]]
            kids_str = "[{0}]".format(" ".join(kids_no_str))
            pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
                tpon, kids_str, count_dict[tpon])
            pdf_data += bytes(pages_str, "utf-8")
    pdf_data += bytes("\n%%EOF\r", "utf-8")
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    # Use mutool to repair xref
    try:
        check_output(["./mutool", "clean", "pdf.tmp", "pdf_toc.pdf"],
                     stderr=STDOUT)
    except CalledProcessError as e:
        print(e.output.decode("utf-8"))
        raise SystemExit("Command mutool returned non-zero exit status " +
                         str(e.returncode))
    # Add Outlines
    add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
    pdf.close()
    os.remove("pdf.tmp")
    os.remove("pdf_toc.pdf")
def _convert_hn(self, dest):
    """Convert this HN-format CAJ file to a PDF written at *dest*.

    Iterates the per-page 20-byte records after the TOC, decompresses each
    page's text section (when COMPRESSTEXT-tagged), parses out figure
    references, decodes every embedded image (JBIG / JBIG2 / JPEG), and
    assembles the decoded images into a PDF, finally attaching the TOC as
    outlines.

    Raises SystemExit on malformed image records, unknown image types, or a
    pure-text file with no images.
    """
    caj = open(self.filename, "rb")
    image_list = []
    from pdfwutils import Colorspace, ImageFormat, convert_ImageList
    import zlib
    for i in range(self.page_num):
        # Each page record is 20 bytes starting right after the TOC.
        caj.seek(self._TOC_END_OFFSET + i * 20)
        [
            page_data_offset, size_of_text_section, images_per_page,
            page_no, unk2, next_page_data_offset
        ] = struct.unpack("iihhii", caj.read(20))
        caj.seek(page_data_offset)
        text_header_read32 = caj.read(32)
        if (text_header_read32[8:20] == b'COMPRESSTEXT'):
            # zlib-compressed text; bytes 20..24 hold the expanded size.
            [expanded_text_size] = struct.unpack("i",
                                                 text_header_read32[20:24])
            import zlib  # NOTE(review): redundant, already imported above
            caj.seek(page_data_offset + 24)
            data = caj.read(size_of_text_section - 24)
            output = zlib.decompress(data, bufsize=expanded_text_size)
            if (len(output) != expanded_text_size):
                raise SystemExit("Unexpected:", len(output),
                                 expanded_text_size)
        else:
            # Uncompressed text section: take it verbatim.
            caj.seek(page_data_offset)
            output = caj.read(size_of_text_section)
        from HNParsePage import HNParsePage
        # Page layout style is inferred from record ordering — TODO confirm.
        page_style = (next_page_data_offset > page_data_offset)
        page_data = HNParsePage(output, page_style)
        if (images_per_page > 1):
            # Multi-image page: push a None marker, then the figure layout
            # list, before the decoded images themselves.
            if (len(page_data.figures) == images_per_page):
                image_list.append(None)
                image_list.append(page_data.figures)
            else:
                print("Page %d, Image Count %d != %d" %
                      (i + 1, len(page_data.figures), images_per_page))
                image_list.append(None)
                image_list.append(page_data.figures[0:images_per_page])
        current_offset = page_data_offset + size_of_text_section
        for j in range(images_per_page):
            # 12-byte image header: type enum, data offset, data size.
            caj.seek(current_offset)
            read32 = caj.read(32)
            [image_type_enum, offset_to_image_data,
             size_of_image_data] = struct.unpack("iii", read32[0:12])
            if (offset_to_image_data != current_offset + 12):
                raise SystemExit("unusual image offset")
            caj.seek(offset_to_image_data)
            image_data = caj.read(size_of_image_data)
            current_offset = offset_to_image_data + size_of_image_data
            if (image_type[image_type_enum] == "JBIG"):
                from jbigdec import CImage
                cimage = CImage(image_data)
                out = cimage.DecodeJbig()
                # PBM is only padded to 8 rather than 32.
                # If the padding is larger, write padded file.
                width = cimage.width
                if (cimage.bytes_per_line > ((cimage.width + 7) >> 3)):
                    width = cimage.bytes_per_line << 3
                image_item = (Colorspace.P, (300, 300), ImageFormat.PBM,
                              zlib.compress(out), width, cimage.height,
                              [0xffffff, 0], False, 1, 0)
            elif (image_type[image_type_enum] == "JBIG2"):
                from jbig2dec import CImage
                cimage = CImage(image_data)
                out = cimage.DecodeJbig2()
                # PBM is only padded to 8 rather than 32.
                # If the padding is larger, write padded file.
                width = cimage.width
                if (cimage.bytes_per_line > ((cimage.width + 7) >> 3)):
                    width = cimage.bytes_per_line << 3
                image_item = (Colorspace.P, (300, 300), ImageFormat.PBM,
                              zlib.compress(out), width, cimage.height,
                              [0xffffff, 0], False, 1, 0)
            elif (image_type[image_type_enum] == "JPEG"):
                # stock libjpeg location
                (SOFn, frame_length, bits_per_pixel, height,
                 width) = struct.unpack(">HHBHH", image_data[158:167])
                if (SOFn != 0xFFC0):
                    # "Intel(R) JPEG Library" location
                    (SOFn, frame_length, bits_per_pixel, height,
                     width) = struct.unpack(">HHBHH", image_data[0x272:0x27b])
                    if (SOFn != 0xFFC0):
                        # neither works, try brute-force
                        import imagesize
                        with open(".tmp.jpg", "wb") as f:
                            f.write(image_data)
                        (width, height) = imagesize.get(".tmp.jpg")
                        os.remove(".tmp.jpg")
                if (image_type_enum == 1):
                    # non-inverted JPEG Images
                    height = -height
                image_item = (Colorspace.RGB, (300, 300), ImageFormat.JPEG,
                              image_data, width, height, [], False, 8, 0)
            else:
                raise SystemExit("Unknown Image Type %d" % (image_type_enum))
            image_list.append(image_item)
    if (len(image_list) == 0):
        raise SystemExit("File is pure-text HN; cannot convert to pdf")
    pdf_data = convert_ImageList(image_list)
    with open('pdf_toc.pdf', 'wb') as f:
        f.write(pdf_data)
    # Add Outlines
    add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
    os.remove("pdf_toc.pdf")
# Command-line dispatch for the parsed arguments.
# NOTE(review): the opening statement of this chunk was truncated by the
# source mangling; the "show" branch below is a reconstruction of the usual
# file-info report — confirm against revision history.
if args.command == "show":
    caj = CAJParser(args.input)
    print("File: {0}\nType: {1}\nPage count: {2}\nTOC count: {3}".format(
        args.input, caj.format, caj.page_num, caj.toc_num))
if args.command == "convert":
    caj = CAJParser(args.input)
    if args.output is None:
        # Derive the output name from the input when not given explicitly.
        if args.input.endswith(".caj"):
            # Fix: slice off the suffix instead of str.replace, which
            # replaced the FIRST ".caj" occurrence anywhere in the path
            # (e.g. "my.caj.dir/a.caj" was mangled).
            args.output = args.input[:-len(".caj")] + ".pdf"
        elif (len(args.input) > 4
              and (args.input[-4] == '.' or args.input[-3] == '.')
              and not args.input.endswith(".pdf")):
            # Any other short extension: swap it for .pdf.
            args.output = os.path.splitext(args.input)[0] + ".pdf"
        else:
            args.output = args.input + ".pdf"
    caj.convert(args.output)
if args.command == "outlines":
    caj = CAJParser(args.input)
    if caj.format == "PDF" or caj.format == "KDH":
        raise SystemExit("Unsupported file type: {0}.".format(caj.format))
    toc = caj.get_toc()
    # Write outlines into a temp file, then atomically replace the target.
    add_outlines(toc, args.output, "tmp.pdf")
    os.replace("tmp.pdf", args.output)
if args.command == "text-extract":
    caj = CAJParser(args.input)
    caj.text_extract()
if args.command == "parse":
    caj = CAJParser(args.input)
    caj.parse()
def _convert_caj(self, dest):
    """Convert this CAJ file to a PDF written at *dest* (legacy variant).

    Carves the embedded PDF object stream out of the CAJ container, appends
    a synthesized Catalog (and, when absent, the root Pages object), repairs
    the xref table with mutool, then attaches the CAJ TOC as PDF outlines.

    Fixes over the previous revision: file handles are closed, and temp-file
    cleanup uses os.remove instead of shelling out to `rm -f` (which only
    worked on POSIX systems).
    """
    caj = open(self.filename, "rb")
    # Extract original PDF data (and add header)
    caj.seek(self._PAGE_NUMBER_OFFSET + 4)
    [pdf_start_pointer] = struct.unpack("i", caj.read(4))
    caj.seek(pdf_start_pointer)
    [pdf_start] = struct.unpack("i", caj.read(4))
    # The PDF body ends 6 bytes past the last "endobj" marker.
    pdf_end = fnd_all(caj, b"endobj")[-1] + 6
    pdf_length = pdf_end - pdf_start
    caj.seek(pdf_start)
    pdf_data = b"%PDF-1.3\r\n" + caj.read(pdf_length) + b"\r\n"
    caj.close()  # fix: input handle was never closed
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Catalog (find obj_no of pages)
    # Each "/Parent <n> 0 R" reference names a Pages object candidate.
    inds_addr = [i + 8 for i in fnd_all(pdf, b"/Parent")]
    inds = []
    for addr in inds_addr:
        pdf.seek(addr)
        length = 0
        # Scan forward byte-by-byte until the space ends the object number.
        while True:
            [s] = struct.unpack("s", pdf.read(1))
            if s == b" ":
                break
            else:
                length += 1
                pdf.seek(addr + length)
        pdf.seek(addr)
        [ind] = struct.unpack(str(length) + "s", pdf.read(length))
        inds.append(int(ind))
    # Assume the root Pages object carries the smallest object number.
    pages_obj_no = min(inds)
    catalog = bytes(
        "1 0 obj\r<</Type /Catalog\r/Pages {0} 0 R\r>>\rendobj\r".format(
            pages_obj_no), "utf-8")
    pdf_data += catalog
    pdf.close()  # fix: close before reopening over the rewritten temp file
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Pages obj and EOF mark
    # If the root Pages object is missing, synthesize it from its kids.
    if fnd(pdf, bytes("\r{0} 0 obj\r<<".format(pages_obj_no),
                      "utf-8")) == -1:
        kids_addr = fnd_all(
            pdf, bytes("/Parent {0} 0 R".format(pages_obj_no), "utf-8"))
        inds_addr = []
        for kid in kids_addr:
            # Walk back from the /Parent reference to the kid's own
            # "<n> 0 obj\r<<" header.
            ind = kid - 6
            while True:
                pdf.seek(ind)
                [obj_str] = struct.unpack("6s", pdf.read(6))
                if obj_str == b"obj\r<<":
                    break
                else:
                    ind = ind - 1
            ind -= 1
            pdf.seek(ind)
            # Back up to the preceding CR: the object number starts after it.
            while True:
                [s] = struct.unpack("s", pdf.read(1))
                if s == b"\r":
                    break
                else:
                    ind -= 1
                    pdf.seek(ind)
            inds_addr.append(ind + 1)
        inds = []
        for addr in inds_addr:
            pdf.seek(addr)
            length = 0
            while True:
                [s] = struct.unpack("s", pdf.read(1))
                if s == b" ":
                    break
                else:
                    length += 1
                    pdf.seek(addr + length)
            pdf.seek(addr)
            [ind] = struct.unpack(str(length) + "s", pdf.read(length))
            inds.append(int(ind))
        inds_str = ["{0} 0 R".format(i) for i in inds]
        kids_str = "[{0}]".format(" ".join(inds_str))
        pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj".format(
            pages_obj_no, kids_str, self.page_num)
        pdf_data += bytes(pages_str, "utf-8")
    pdf_data += bytes("\r\n%%EOF\r", "utf-8")
    pdf.close()  # fix: temp-file handle was never closed
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    # Use mutool to repair xref
    call(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"])
    # Add Outlines
    add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
    # fix: portable cleanup; mirror `rm -f` by ignoring missing files
    for tmp_name in ("pdf.tmp", "pdf_toc.pdf"):
        try:
            os.remove(tmp_name)
        except FileNotFoundError:
            pass
def _convert_caj(self, dest):
    """Convert this CAJ file to a PDF written at *dest*.

    Strategy: carve the embedded PDF object stream out of the CAJ container,
    de-duplicate/reorder the objects, synthesize the missing Catalog (and any
    missing Pages objects), then let mutool (resolved from PATH) rebuild the
    xref table before adding the CAJ TOC as PDF outlines.

    Raises SystemExit if mutool fails.
    """
    caj = open(self.filename, "rb")
    # Extract original PDF data (and add header)
    # A pointer stored after the page-number field gives the offset of the
    # embedded PDF; its first 4 bytes give the PDF data start.
    caj.seek(self._PAGE_NUMBER_OFFSET + 4)
    [pdf_start_pointer] = struct.unpack("i", caj.read(4))
    caj.seek(pdf_start_pointer)
    [pdf_start] = struct.unpack("i", caj.read(4))
    # The PDF body ends 6 bytes past the last "endobj" marker.
    pdf_end = fnd_all(caj, b"endobj")[-1] + 6
    pdf_length = pdf_end - pdf_start
    caj.seek(pdf_start)
    pdf_data = b"%PDF-1.3\r\n" + caj.read(pdf_length) + b"\r\n"
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # deal with disordered PDF data
    # Re-emit each indirect object exactly once, in scan order, skipping
    # duplicate object numbers.
    endobj_addr = fnd_all(pdf, b"endobj")
    pdf_data = b"%PDF-1.3\r\n"
    obj_no = []
    for addr in endobj_addr:
        # Walk back from "endobj" to the start of the "<no> 0 obj" header.
        startobj = fnd_rvrs(pdf, b" 0 obj", addr)
        startobj1 = fnd_rvrs(pdf, b"\r", startobj)
        startobj2 = fnd_rvrs(pdf, b"\n", startobj)
        startobj = max(startobj1, startobj2)
        length = fnd(pdf, b" ", startobj) - startobj
        pdf.seek(startobj)
        [no] = struct.unpack(str(length) + "s", pdf.read(length))
        if int(no) not in obj_no:
            obj_no.append(int(no))
            obj_len = addr - startobj + 6
            pdf.seek(startobj)
            [obj] = struct.unpack(str(obj_len) + "s", pdf.read(obj_len))
            pdf_data += (b"\r" + obj)
    pdf_data += b"\r\n"
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Catalog (find obj_no of pages)
    # Every "/Parent <n> 0 R" reference names a Pages object candidate.
    inds_addr = [i + 8 for i in fnd_all(pdf, b"/Parent")]
    inds = []
    for addr in inds_addr:
        length = fnd(pdf, b" ", addr) - addr
        pdf.seek(addr)
        [ind] = struct.unpack(str(length) + "s", pdf.read(length))
        inds.append(int(ind))
    # get pages_obj_no list containing distinct elements
    # & find missing pages object(s) -- top pages object(s) in pages_obj_no
    pages_obj_no = []
    top_pages_obj_no = []
    for ind in inds:
        if (ind not in pages_obj_no) and (ind not in top_pages_obj_no):
            # A referenced object with no "<n> 0 obj" definition is a
            # missing (top-level) Pages object that must be synthesized.
            if fnd(pdf, bytes("\r{0} 0 obj".format(ind), "utf-8")) == -1:
                top_pages_obj_no.append(ind)
            else:
                pages_obj_no.append(ind)
    single_pages_obj_missed = len(top_pages_obj_no) == 1
    multi_pages_obj_missed = len(top_pages_obj_no) > 1
    # generate catalog object
    catalog_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
    obj_no.append(catalog_obj_no)
    root_pages_obj_no = None
    if multi_pages_obj_missed:
        # Several roots are missing: allocate a fresh number for a new
        # umbrella root Pages object.
        root_pages_obj_no = fnd_unuse_no(obj_no, top_pages_obj_no)
    elif single_pages_obj_missed:
        # Exactly one root missing: it becomes the root; existing Pages
        # objects become its kids.
        root_pages_obj_no = top_pages_obj_no[0]
        top_pages_obj_no = pages_obj_no
    else:
        # root pages object exists, then find the root pages object
        # (the Pages object whose dictionary has no /Parent before endobj).
        found = False
        for pon in pages_obj_no:
            tmp_addr = fnd(pdf, bytes("\r{0} 0 obj".format(pon), 'utf-8'))
            while True:
                pdf.seek(tmp_addr)
                [_str] = struct.unpack("6s", pdf.read(6))
                if _str == b"Parent":
                    break
                elif _str == b"endobj":
                    root_pages_obj_no = pon
                    found = True
                    break
                tmp_addr = tmp_addr + 1
            if found:
                break
    catalog = bytes("{0} 0 obj\r<</Type /Catalog\r/Pages {1} 0 R\r>>\rendobj\r".format(
        catalog_obj_no, root_pages_obj_no), "utf-8")
    pdf_data += catalog
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    pdf = open("pdf.tmp", "rb")
    # Add Pages obj and EOF mark
    # if root pages object exist, pass
    # deal with single missing pages object
    if single_pages_obj_missed or multi_pages_obj_missed:
        inds_str = ["{0} 0 R".format(i) for i in top_pages_obj_no]
        kids_str = "[{0}]".format(" ".join(inds_str))
        pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
            root_pages_obj_no, kids_str, self.page_num)
        pdf_data += bytes(pages_str, "utf-8")
        with open("pdf.tmp", 'wb') as f:
            f.write(pdf_data)
        pdf = open("pdf.tmp", "rb")
    # deal with multiple missing pages objects
    if multi_pages_obj_missed:
        # Synthesize each missing Pages object: collect its kids and a
        # /Count that sums child page counts.
        kids_dict = {i: [] for i in top_pages_obj_no}
        count_dict = {i: 0 for i in top_pages_obj_no}
        for tpon in top_pages_obj_no:
            kids_addr = fnd_all(pdf, bytes("/Parent {0} 0 R".format(tpon),
                                           "utf-8"))
            for kid in kids_addr:
                # Walk back from the /Parent reference to the kid's own
                # "<n> 0 obj" header to recover its object number.
                ind = fnd_rvrs(pdf, b"obj", kid) - 4
                addr = fnd_rvrs(pdf, b"\r", ind)
                length = fnd(pdf, b" ", addr) - addr
                pdf.seek(addr)
                [ind] = struct.unpack(str(length) + "s", pdf.read(length))
                kids_dict[tpon].append(int(ind))
                type_addr = fnd(pdf, b"/Type", addr) + 5
                tmp_addr = fnd(pdf, b"/", type_addr) + 1
                pdf.seek(tmp_addr)
                [_type] = struct.unpack("5s", pdf.read(5))
                if _type == b"Pages":
                    # Intermediate Pages node: add its declared /Count.
                    cnt_addr = fnd(pdf, b"/Count ", addr) + 7
                    pdf.seek(cnt_addr)
                    [_str] = struct.unpack("1s", pdf.read(1))
                    cnt_len = 0
                    # Scan digits until a delimiter ends the number.
                    while _str not in [b" ", b"\r", b"/"]:
                        cnt_len += 1
                        pdf.seek(cnt_addr + cnt_len)
                        [_str] = struct.unpack("1s", pdf.read(1))
                    pdf.seek(cnt_addr)
                    [cnt] = struct.unpack(str(cnt_len) + "s",
                                          pdf.read(cnt_len))
                    count_dict[tpon] += int(cnt)
                else:  # _type == b"Page"
                    count_dict[tpon] += 1
            kids_no_str = ["{0} 0 R".format(i) for i in kids_dict[tpon]]
            kids_str = "[{0}]".format(" ".join(kids_no_str))
            pages_str = "{0} 0 obj\r<<\r/Type /Pages\r/Kids {1}\r/Count {2}\r>>\rendobj\r".format(
                tpon, kids_str, count_dict[tpon])
            pdf_data += bytes(pages_str, "utf-8")
    pdf_data += bytes("\n%%EOF\r", "utf-8")
    with open("pdf.tmp", 'wb') as f:
        f.write(pdf_data)
    # Use mutool to repair xref
    try:
        check_output(["mutool", "clean", "pdf.tmp", "pdf_toc.pdf"],
                     stderr=STDOUT)
    except CalledProcessError as e:
        print(e.output.decode("utf-8"))
        raise SystemExit("Command mutool returned non-zero exit status " +
                         str(e.returncode))
    # Add Outlines
    add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
    pdf.close()
    os.remove("pdf.tmp")
    os.remove("pdf_toc.pdf")