def _convert_hn(self, dest):
    """Convert the HN-format file at ``self.filename`` into a PDF at ``dest``.

    For each of the ``self.page_num`` pages, a 20-byte page record is read at
    ``self._TOC_END_OFFSET + i * 20``. The page's text section (zlib-inflated
    when it carries a ``COMPRESSTEXT`` tag) is parsed with ``HNParsePage``,
    and every image stored after the text section is decoded into an entry
    for ``pdfwutils.convert_ImageList``. The resulting PDF is written to a
    temporary file and ``add_outlines`` produces the final file at ``dest``
    using ``self.get_toc()``.

    Args:
        dest: Output path handed to ``add_outlines``.

    Raises:
        SystemExit: If the inflated text size does not match the size stored
            in the header, if an image is not located directly after its
            12-byte header, if an image type is unknown, or if the file
            contains no images at all.
    """
    import tempfile
    import zlib
    from HNParsePage import HNParsePage
    from pdfwutils import Colorspace, ImageFormat, convert_ImageList

    image_list = []
    # The context manager guarantees the input handle is closed, including
    # when one of the SystemExit paths below is taken.
    with open(self.filename, "rb") as caj:
        for i in range(self.page_num):
            caj.seek(self._TOC_END_OFFSET + i * 20)
            (page_data_offset, size_of_text_section, images_per_page,
             _page_no, _unk2, next_page_data_offset) = struct.unpack(
                "iihhii", caj.read(20))

            caj.seek(page_data_offset)
            text_header_read32 = caj.read(32)
            if text_header_read32[8:20] == b'COMPRESSTEXT':
                # The expanded size follows the tag; the zlib stream starts
                # 24 bytes into the text section.
                [expanded_text_size] = struct.unpack(
                    "i", text_header_read32[20:24])
                caj.seek(page_data_offset + 24)
                data = caj.read(size_of_text_section - 24)
                output = zlib.decompress(data, bufsize=expanded_text_size)
                if len(output) != expanded_text_size:
                    raise SystemExit("Unexpected:", len(output),
                                     expanded_text_size)
            else:
                caj.seek(page_data_offset)
                output = caj.read(size_of_text_section)

            # NOTE(review): the meaning of page_style is defined by
            # HNParsePage; here it is only derived from the two offsets.
            page_style = (next_page_data_offset > page_data_offset)
            page_data = HNParsePage(output, page_style)

            if images_per_page > 1:
                # Multi-image pages are announced with a None marker followed
                # by the figure list, ahead of the images themselves.
                figures = page_data.figures
                if len(figures) != images_per_page:
                    print("Page %d, Image Count %d != %d" %
                          (i + 1, len(page_data.figures), images_per_page))
                    figures = figures[0:images_per_page]
                image_list.append(None)
                image_list.append(figures)

            current_offset = page_data_offset + size_of_text_section
            for _ in range(images_per_page):
                caj.seek(current_offset)
                read32 = caj.read(32)
                (image_type_enum, offset_to_image_data,
                 size_of_image_data) = struct.unpack("iii", read32[0:12])
                if offset_to_image_data != current_offset + 12:
                    raise SystemExit("unusual image offset")
                caj.seek(offset_to_image_data)
                image_data = caj.read(size_of_image_data)
                current_offset = offset_to_image_data + size_of_image_data

                # Look the type up once. An unknown enum must reach the
                # "Unknown Image Type" error below instead of escaping as a
                # lookup error from the subscript.
                try:
                    kind = image_type[image_type_enum]
                except (KeyError, IndexError):
                    kind = None

                if kind == "JBIG" or kind == "JBIG2":
                    if kind == "JBIG":
                        from jbigdec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig()
                    else:
                        from jbig2dec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig2()
                    # PBM is only padded to 8 rather than 32.
                    # If the padding is larger, write padded file.
                    width = cimage.width
                    if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                        width = cimage.bytes_per_line << 3
                    image_item = (
                        Colorspace.P, (300, 300), ImageFormat.PBM,
                        zlib.compress(out), width, cimage.height,
                        [0xffffff, 0], False, 1, 0)
                elif kind == "JPEG":
                    # Probe the two known frame-header positions: stock
                    # libjpeg first, then "Intel(R) JPEG Library".
                    for sof_start in (158, 0x272):
                        frame_header = image_data[sof_start:sof_start + 9]
                        if len(frame_header) < 9:
                            # Data too short for this probe; try the next.
                            continue
                        (sof_marker, _frame_length, _bits_per_pixel,
                         height, width) = struct.unpack(">HHBHH",
                                                        frame_header)
                        if sof_marker == 0xFFC0:
                            break
                    else:
                        # neither works, try brute-force
                        import imagesize
                        # A private temporary file avoids clobbering (and
                        # then deleting) a user's ".tmp.jpg" in the working
                        # directory and is removed even on failure.
                        fd, jpg_path = tempfile.mkstemp(suffix=".jpg")
                        try:
                            with os.fdopen(fd, "wb") as f:
                                f.write(image_data)
                            (width, height) = imagesize.get(jpg_path)
                        finally:
                            os.remove(jpg_path)
                    if image_type_enum == 1:
                        # non-inverted JPEG Images
                        # NOTE(review): the negative height is presumably an
                        # orientation flag for convert_ImageList - confirm.
                        height = -height
                    image_item = (
                        Colorspace.RGB, (300, 300), ImageFormat.JPEG,
                        image_data, width, height, [], False, 8, 0)
                else:
                    raise SystemExit("Unknown Image Type %d" %
                                     (image_type_enum))
                image_list.append(image_item)

    if not image_list:
        raise SystemExit("File is pure-text HN; cannot convert to pdf")

    pdf_data = convert_ImageList(image_list)
    # Stage the outline-less PDF in a private temporary file so a fixed
    # "pdf_toc.pdf" in the working directory is never overwritten, and so
    # the scratch file is removed even when add_outlines fails.
    fd, toc_pdf_path = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(pdf_data)
        # Add Outlines
        add_outlines(self.get_toc(), toc_pdf_path, dest)
    finally:
        os.remove(toc_pdf_path)
def _convert_hn(self, dest):
    """Convert the HN-format file at ``self.filename`` into a PDF at ``dest``.

    For each of the ``self.page_num`` pages, a 20-byte page record is read at
    ``self._TOC_END_OFFSET + i * 20``. The images stored directly after the
    page's text section are decoded into entries for
    ``pdfwutils.convert_ImageList``, whose output is written to ``dest``.
    The text section itself is skipped.

    Args:
        dest: Path of the PDF file to write.

    Raises:
        SystemExit: If an image is not located directly after its 12-byte
            header.
    """
    import zlib
    from pdfwutils import Colorspace, ImageFormat, convert_ImageList

    image_list = []
    # The context manager guarantees the input handle is closed, including
    # when the SystemExit path below is taken.
    with open(self.filename, "rb") as caj:
        for i in range(self.page_num):
            caj.seek(self._TOC_END_OFFSET + i * 20)
            (page_data_offset, size_of_text_section, images_per_page,
             _page_no, _unk2, _unk3) = struct.unpack("iihhii", caj.read(20))

            current_offset = page_data_offset + size_of_text_section
            for j in range(images_per_page):
                caj.seek(current_offset)
                read32 = caj.read(32)
                (image_type_enum, offset_to_image_data,
                 size_of_image_data) = struct.unpack("iii", read32[0:12])
                if offset_to_image_data != current_offset + 12:
                    raise SystemExit("unusual image offset")
                caj.seek(offset_to_image_data)
                image_data = caj.read(size_of_image_data)
                current_offset = offset_to_image_data + size_of_image_data

                if j > 0:
                    print("TODO: Multiple Images at Page %04d_%04d" %
                          (i + 1, j))

                # Look the type name up once instead of once per branch.
                kind = image_type[image_type_enum]
                if kind == "JBIG" or kind == "JBIG2":
                    if kind == "JBIG":
                        from jbigdec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig()
                    else:
                        from jbig2dec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig2()
                    # PBM is only padded to 8 rather than 32.
                    # If the padding is larger, write padded file.
                    width = cimage.width
                    if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                        width = cimage.bytes_per_line << 3
                    image_list.append(
                        (Colorspace.P, (300, 300), ImageFormat.PBM,
                         zlib.compress(out), width, cimage.height,
                         [0xffffff, 0], False, 1, 0))
                elif kind == "JPEG":
                    # Height and width are read as big-endian 16-bit values
                    # at a fixed position in the JPEG data.
                    (height, width) = struct.unpack(
                        ">HH", image_data[163:167])
                    image_list.append(
                        (Colorspace.RGB, (300, 300), ImageFormat.JPEG,
                         image_data, width, height, [], False, 8, 0))
                    if image_type_enum == 1:
                        print(
                            "TODO: non-inverted JPEG Images at Page %04d_%04d"
                            % (i + 1, j))

    pdf_data = convert_ImageList(image_list)
    with open(dest, 'wb') as f:
        f.write(pdf_data)
def _convert_hn(self, dest):
    """Convert the HN-format file at ``self.filename`` into a PDF at ``dest``.

    For each of the ``self.page_num`` pages, a 20-byte page record is read at
    ``self._TOC_END_OFFSET + i * 20``. The page's text section (zlib-inflated
    when it carries a ``COMPRESSTEXT`` tag) is parsed with ``HNParsePage``,
    and every image stored after the text section is decoded into an entry
    for ``pdfwutils.convert_ImageList``, whose output is written to ``dest``.

    Args:
        dest: Path of the PDF file to write.

    Raises:
        SystemExit: If the inflated text size does not match the size stored
            in the header, if a multi-image page's parsed figure count
            differs from the page record, if an image is not located
            directly after its 12-byte header, or if an image type is
            unknown.
    """
    import zlib
    from HNParsePage import HNParsePage
    from pdfwutils import Colorspace, ImageFormat, convert_ImageList

    image_list = []
    # The context manager guarantees the input handle is closed, including
    # when one of the SystemExit paths below is taken.
    with open(self.filename, "rb") as caj:
        for i in range(self.page_num):
            caj.seek(self._TOC_END_OFFSET + i * 20)
            (page_data_offset, size_of_text_section, images_per_page,
             _page_no, _unk2, next_page_data_offset) = struct.unpack(
                "iihhii", caj.read(20))

            caj.seek(page_data_offset)
            text_header_read32 = caj.read(32)
            if text_header_read32[8:20] == b'COMPRESSTEXT':
                # The expanded size follows the tag; the zlib stream starts
                # 24 bytes into the text section.
                [expanded_text_size] = struct.unpack(
                    "i", text_header_read32[20:24])
                caj.seek(page_data_offset + 24)
                data = caj.read(size_of_text_section - 24)
                output = zlib.decompress(data, bufsize=expanded_text_size)
                if len(output) != expanded_text_size:
                    raise SystemExit("Unexpected:", len(output),
                                     expanded_text_size)
            else:
                caj.seek(page_data_offset)
                output = caj.read(size_of_text_section)

            # NOTE(review): the meaning of page_style is defined by
            # HNParsePage; here it is only derived from the two offsets.
            page_style = (next_page_data_offset > page_data_offset)
            page_data = HNParsePage(output, page_style)

            if images_per_page > 1:
                if len(page_data.figures) != images_per_page:
                    raise SystemExit(
                        "Image Count %d != %d" %
                        (len(page_data.figures), images_per_page))
                # Multi-image pages are announced with a None marker followed
                # by the figure list, ahead of the images themselves.
                image_list.append(None)
                image_list.append(page_data.figures)

            current_offset = page_data_offset + size_of_text_section
            for _ in range(images_per_page):
                caj.seek(current_offset)
                read32 = caj.read(32)
                (image_type_enum, offset_to_image_data,
                 size_of_image_data) = struct.unpack("iii", read32[0:12])
                if offset_to_image_data != current_offset + 12:
                    raise SystemExit("unusual image offset")
                caj.seek(offset_to_image_data)
                image_data = caj.read(size_of_image_data)
                current_offset = offset_to_image_data + size_of_image_data

                # Look the type up once. An unknown enum must reach the
                # "Unknown Image Type" error below instead of escaping as a
                # lookup error from the subscript.
                try:
                    kind = image_type[image_type_enum]
                except (KeyError, IndexError):
                    kind = None

                if kind == "JBIG" or kind == "JBIG2":
                    if kind == "JBIG":
                        from jbigdec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig()
                    else:
                        from jbig2dec import CImage
                        cimage = CImage(image_data)
                        out = cimage.DecodeJbig2()
                    # PBM is only padded to 8 rather than 32.
                    # If the padding is larger, write padded file.
                    width = cimage.width
                    if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                        width = cimage.bytes_per_line << 3
                    image_item = (
                        Colorspace.P, (300, 300), ImageFormat.PBM,
                        zlib.compress(out), width, cimage.height,
                        [0xffffff, 0], False, 1, 0)
                elif kind == "JPEG":
                    # Height and width are read as big-endian 16-bit values
                    # at a fixed position in the JPEG data.
                    (height, width) = struct.unpack(
                        ">HH", image_data[163:167])
                    if image_type_enum == 1:
                        # non-inverted JPEG Images
                        # NOTE(review): the negative height is presumably an
                        # orientation flag for convert_ImageList - confirm.
                        height = -height
                    image_item = (
                        Colorspace.RGB, (300, 300), ImageFormat.JPEG,
                        image_data, width, height, [], False, 8, 0)
                else:
                    raise SystemExit("Unknown Image Type %d" %
                                     (image_type_enum))
                image_list.append(image_item)

    pdf_data = convert_ImageList(image_list)
    with open(dest, 'wb') as f:
        f.write(pdf_data)