Ejemplo n.º 1
0
    def _convert_hn(self, dest):
        """Convert the HN file at ``self.filename`` to a PDF written to *dest*.

        For each of the ``self.page_num`` pages, a 20-byte page record is
        read starting at ``self._TOC_END_OFFSET``.  The page's text section
        (zlib-inflated when it carries a ``COMPRESSTEXT`` header) is parsed
        with ``HNParsePage``, then every image that follows the text section
        is decoded into a tuple appended to the list that is finally handed
        to ``pdfwutils.convert_ImageList``.  The resulting PDF is written to
        the temporary file ``pdf_toc.pdf`` and ``add_outlines`` produces
        *dest* from it.

        Args:
            dest: Output path, passed through to ``add_outlines``.

        Raises:
            SystemExit: on a text-size mismatch after decompression, an
                image offset that does not directly follow its 12-byte
                header, an unknown image type code, or a file that yields
                no images at all.
        """
        from pdfwutils import Colorspace, ImageFormat, convert_ImageList
        import zlib

        image_list = []

        # The context manager closes the input file even when one of the
        # SystemExit paths below fires (the handle used to be leaked).
        with open(self.filename, "rb") as caj:
            for i in range(self.page_num):
                # One 20-byte record per page, stored back to back.
                caj.seek(self._TOC_END_OFFSET + i * 20)
                [
                    page_data_offset, size_of_text_section, images_per_page,
                    page_no, unk2, next_page_data_offset
                ] = struct.unpack("iihhii", caj.read(20))

                # --- text section -------------------------------------
                caj.seek(page_data_offset)
                text_header_read32 = caj.read(32)
                if text_header_read32[8:20] == b'COMPRESSTEXT':
                    # Compressed text: bytes 20..24 hold the inflated size
                    # and the zlib stream starts right after them.
                    [expanded_text_size] = struct.unpack(
                        "i", text_header_read32[20:24])
                    caj.seek(page_data_offset + 24)
                    data = caj.read(size_of_text_section - 24)
                    output = zlib.decompress(data,
                                             bufsize=expanded_text_size)
                    if len(output) != expanded_text_size:
                        raise SystemExit("Unexpected:", len(output),
                                         expanded_text_size)
                else:
                    caj.seek(page_data_offset)
                    output = caj.read(size_of_text_section)

                # Imported lazily, as in the original, so that a file with
                # zero pages never needs the parser module.
                from HNParsePage import HNParsePage
                # NOTE(review): the meaning of page_style is defined by
                # HNParsePage; here it is only "next offset > this offset".
                page_style = (next_page_data_offset > page_data_offset)
                page_data = HNParsePage(output, page_style)

                if images_per_page > 1:
                    # Multi-image pages are announced by a None entry
                    # followed by the figure list (presumably placement
                    # info consumed by convert_ImageList -- confirm there).
                    if len(page_data.figures) == images_per_page:
                        image_list.append(None)
                        image_list.append(page_data.figures)
                    else:
                        print("Page %d, Image Count %d != %d" %
                              (i + 1, len(page_data.figures),
                               images_per_page))
                        image_list.append(None)
                        image_list.append(
                            page_data.figures[0:images_per_page])

                # --- image sections -----------------------------------
                current_offset = page_data_offset + size_of_text_section
                for _ in range(images_per_page):
                    caj.seek(current_offset)
                    read32 = caj.read(32)
                    [image_type_enum, offset_to_image_data,
                     size_of_image_data] = struct.unpack("iii", read32[0:12])
                    # The payload must start right after the 12-byte header.
                    if offset_to_image_data != current_offset + 12:
                        raise SystemExit("unusual image offset")
                    caj.seek(offset_to_image_data)
                    image_data = caj.read(size_of_image_data)
                    current_offset = offset_to_image_data + size_of_image_data

                    # A code missing from the image_type table must reach
                    # the "Unknown Image Type" error below instead of
                    # escaping as a bare lookup error.
                    try:
                        image_kind = image_type[image_type_enum]
                    except (KeyError, IndexError):
                        image_kind = None

                    if image_kind in ("JBIG", "JBIG2"):
                        # Decoder modules are imported on demand so only
                        # the one actually needed has to be available.
                        if image_kind == "JBIG":
                            from jbigdec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig()
                        else:
                            from jbig2dec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig2()
                        # PBM is only padded to 8 rather than 32.
                        # If the padding is larger, write padded file.
                        width = cimage.width
                        if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                            width = cimage.bytes_per_line << 3
                        image_item = (Colorspace.P, (300, 300),
                                      ImageFormat.PBM, zlib.compress(out),
                                      width, cimage.height, [0xffffff, 0],
                                      False, 1, 0)
                    elif image_kind == "JPEG":
                        # Probe the SOF0 frame header at the two known
                        # offsets: stock libjpeg first, then the
                        # "Intel(R) JPEG Library" location.  Data too short
                        # for a probe is skipped rather than crashing in
                        # struct.unpack.
                        dimensions = None
                        for sof_offset in (158, 0x272):
                            frame_header = image_data[sof_offset:
                                                      sof_offset + 9]
                            if len(frame_header) != 9:
                                continue
                            (SOFn, _frame_length, _bits_per_pixel,
                             height, width) = struct.unpack(">HHBHH",
                                                            frame_header)
                            if SOFn == 0xFFC0:
                                dimensions = (width, height)
                                break
                        if dimensions is None:
                            # neither works, try brute-force
                            import imagesize
                            with open(".tmp.jpg", "wb") as f:
                                f.write(image_data)
                            # Measure only after the file is closed, so
                            # the data is flushed, and always clean up.
                            try:
                                dimensions = imagesize.get(".tmp.jpg")
                            finally:
                                os.remove(".tmp.jpg")
                        (width, height) = dimensions
                        if image_type_enum == 1:
                            # non-inverted JPEG Images
                            height = -height
                        image_item = (Colorspace.RGB, (300, 300),
                                      ImageFormat.JPEG, image_data, width,
                                      height, [], False, 8, 0)
                    else:
                        raise SystemExit("Unknown Image Type %d" %
                                         (image_type_enum))
                    image_list.append(image_item)

        if not image_list:
            raise SystemExit("File is pure-text HN; cannot convert to pdf")
        pdf_data = convert_ImageList(image_list)
        with open('pdf_toc.pdf', 'wb') as f:
            f.write(pdf_data)
        # Add Outlines; the intermediate file is removed even on failure.
        try:
            add_outlines(self.get_toc(), "pdf_toc.pdf", dest)
        finally:
            os.remove("pdf_toc.pdf")
Ejemplo n.º 2
0
    def _convert_hn(self, dest):
        """Convert the HN file at ``self.filename`` to a PDF written to *dest*.

        For each of the ``self.page_num`` pages, a 20-byte page record is
        read starting at ``self._TOC_END_OFFSET``.  The images stored after
        the page's text section are decoded (JBIG / JBIG2) or passed through
        (JPEG) and collected into a list that
        ``pdfwutils.convert_ImageList`` turns into the PDF bytes written to
        *dest*.  The text section itself is skipped, not parsed.

        Known limitations, reported on stdout as ``TODO`` messages: more
        than one image on a page, and JPEG images with type code 1.

        Args:
            dest: Path of the PDF file to create.

        Raises:
            SystemExit: if an image payload does not directly follow its
                12-byte header.
        """
        from pdfwutils import Colorspace, ImageFormat, convert_ImageList
        import zlib

        image_list = []

        # The context manager closes the input file on every exit path
        # (the handle used to be leaked).
        with open(self.filename, "rb") as caj:
            for i in range(self.page_num):
                # One 20-byte record per page, stored back to back.
                caj.seek(self._TOC_END_OFFSET + i * 20)
                [
                    page_data_offset, size_of_text_section, images_per_page,
                    page_no, unk2, unk3
                ] = struct.unpack("iihhii", caj.read(20))

                # Images start right after the text section.
                current_offset = page_data_offset + size_of_text_section
                for j in range(images_per_page):
                    caj.seek(current_offset)
                    read32 = caj.read(32)
                    [image_type_enum, offset_to_image_data,
                     size_of_image_data] = struct.unpack("iii", read32[0:12])
                    # The payload must start right after the 12-byte header.
                    if offset_to_image_data != current_offset + 12:
                        raise SystemExit("unusual image offset")
                    caj.seek(offset_to_image_data)
                    image_data = caj.read(size_of_image_data)
                    current_offset = offset_to_image_data + size_of_image_data

                    if j > 0:
                        print("TODO: Multiple Images at Page %04d_%04d" %
                              (i + 1, j))

                    image_kind = image_type[image_type_enum]
                    if image_kind in ("JBIG", "JBIG2"):
                        # Decoder modules are imported on demand so only
                        # the one actually needed has to be available.
                        if image_kind == "JBIG":
                            from jbigdec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig()
                        else:
                            from jbig2dec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig2()
                        # PBM is only padded to 8 rather than 32.
                        # If the padding is larger, write padded file.
                        width = cimage.width
                        if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                            width = cimage.bytes_per_line << 3
                        image_list.append(
                            (Colorspace.P, (300, 300), ImageFormat.PBM,
                             zlib.compress(out), width, cimage.height,
                             [0xffffff, 0], False, 1, 0))
                    elif image_kind == "JPEG":
                        # Height and width are read from a fixed offset
                        # in the JPEG data.
                        (height, width) = struct.unpack(
                            ">HH", image_data[163:167])
                        image_list.append(
                            (Colorspace.RGB, (300, 300), ImageFormat.JPEG,
                             image_data, width, height, [], False, 8, 0))
                        if image_type_enum == 1:
                            print(
                                "TODO: non-inverted JPEG Images at Page "
                                "%04d_%04d" % (i + 1, j))
                    # Any other kind listed in image_type is skipped
                    # silently, exactly as before.

        pdf_data = convert_ImageList(image_list)
        with open(dest, 'wb') as f:
            f.write(pdf_data)
Ejemplo n.º 3
0
    def _convert_hn(self, dest):
        """Convert the HN file at ``self.filename`` to a PDF written to *dest*.

        For each of the ``self.page_num`` pages, a 20-byte page record is
        read starting at ``self._TOC_END_OFFSET``.  The page's text section
        (zlib-inflated when it carries a ``COMPRESSTEXT`` header) is parsed
        with ``HNParsePage``, then every image that follows the text section
        is decoded into a tuple appended to the list that
        ``pdfwutils.convert_ImageList`` turns into the PDF bytes written to
        *dest*.

        Args:
            dest: Path of the PDF file to create.

        Raises:
            SystemExit: on a text-size mismatch after decompression, a
                figure count that differs from the page record, an image
                offset that does not directly follow its 12-byte header,
                or an unknown image type code.
        """
        from pdfwutils import Colorspace, ImageFormat, convert_ImageList
        import zlib

        image_list = []

        # The context manager closes the input file even when one of the
        # SystemExit paths below fires (the handle used to be leaked).
        with open(self.filename, "rb") as caj:
            for i in range(self.page_num):
                # One 20-byte record per page, stored back to back.
                caj.seek(self._TOC_END_OFFSET + i * 20)
                [
                    page_data_offset, size_of_text_section, images_per_page,
                    page_no, unk2, next_page_data_offset
                ] = struct.unpack("iihhii", caj.read(20))

                # --- text section -------------------------------------
                caj.seek(page_data_offset)
                text_header_read32 = caj.read(32)
                if text_header_read32[8:20] == b'COMPRESSTEXT':
                    # Compressed text: bytes 20..24 hold the inflated size
                    # and the zlib stream starts right after them.
                    [expanded_text_size] = struct.unpack(
                        "i", text_header_read32[20:24])
                    caj.seek(page_data_offset + 24)
                    data = caj.read(size_of_text_section - 24)
                    output = zlib.decompress(data,
                                             bufsize=expanded_text_size)
                    if len(output) != expanded_text_size:
                        raise SystemExit("Unexpected:", len(output),
                                         expanded_text_size)
                else:
                    caj.seek(page_data_offset)
                    output = caj.read(size_of_text_section)

                # Imported lazily, as in the original, so that a file with
                # zero pages never needs the parser module.
                from HNParsePage import HNParsePage
                # NOTE(review): the meaning of page_style is defined by
                # HNParsePage; here it is only "next offset > this offset".
                page_style = (next_page_data_offset > page_data_offset)
                page_data = HNParsePage(output, page_style)

                if images_per_page > 1:
                    # Multi-image pages are announced by a None entry
                    # followed by the figure list (presumably placement
                    # info consumed by convert_ImageList -- confirm there).
                    if len(page_data.figures) != images_per_page:
                        raise SystemExit(
                            "Image Count %d != %d" %
                            (len(page_data.figures), images_per_page))
                    image_list.append(None)
                    image_list.append(page_data.figures)

                # --- image sections -----------------------------------
                current_offset = page_data_offset + size_of_text_section
                for _ in range(images_per_page):
                    caj.seek(current_offset)
                    read32 = caj.read(32)
                    [image_type_enum, offset_to_image_data,
                     size_of_image_data] = struct.unpack("iii", read32[0:12])
                    # The payload must start right after the 12-byte header.
                    if offset_to_image_data != current_offset + 12:
                        raise SystemExit("unusual image offset")
                    caj.seek(offset_to_image_data)
                    image_data = caj.read(size_of_image_data)
                    current_offset = offset_to_image_data + size_of_image_data

                    # A code missing from the image_type table must reach
                    # the "Unknown Image Type" error below instead of
                    # escaping as a bare lookup error.
                    try:
                        image_kind = image_type[image_type_enum]
                    except (KeyError, IndexError):
                        image_kind = None

                    if image_kind in ("JBIG", "JBIG2"):
                        # Decoder modules are imported on demand so only
                        # the one actually needed has to be available.
                        if image_kind == "JBIG":
                            from jbigdec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig()
                        else:
                            from jbig2dec import CImage
                            cimage = CImage(image_data)
                            out = cimage.DecodeJbig2()
                        # PBM is only padded to 8 rather than 32.
                        # If the padding is larger, write padded file.
                        width = cimage.width
                        if cimage.bytes_per_line > ((cimage.width + 7) >> 3):
                            width = cimage.bytes_per_line << 3
                        image_item = (Colorspace.P, (300, 300),
                                      ImageFormat.PBM, zlib.compress(out),
                                      width, cimage.height, [0xffffff, 0],
                                      False, 1, 0)
                    elif image_kind == "JPEG":
                        # Height and width are read from a fixed offset
                        # in the JPEG data.
                        (height, width) = struct.unpack(
                            ">HH", image_data[163:167])
                        if image_type_enum == 1:
                            # non-inverted JPEG Images
                            height = -height
                        image_item = (Colorspace.RGB, (300, 300),
                                      ImageFormat.JPEG, image_data, width,
                                      height, [], False, 8, 0)
                    else:
                        raise SystemExit("Unknown Image Type %d" %
                                         (image_type_enum))
                    image_list.append(image_item)

        pdf_data = convert_ImageList(image_list)
        with open(dest, 'wb') as f:
            f.write(pdf_data)