Ejemplo n.º 1
0
    def init(self, page:fitz.Page) -> Layout:
        '''Build the layout of a page and collect its rectangle shapes.

        The layout is created from the `rawdict` text extraction of `page`
        together with the page rotation matrix. Rectangle shapes are then
        read from the page stream and from the page annotations. When
        `self.debug_mode` is set, the initial layout is plotted into the
        debug document as well.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            Layout: The new layout, which is also stored in `self._layout`.
        '''
        # NOTE: every coordinate in the raw dict is relative to the un-rotated page, see
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw = page.getText('rawdict')

        # The raw dict carries its own 'width'/'height', but those describe the
        # un-rotated page. `page.rect` always reflects the rotation, so its last
        # two values replace them.
        width, height = tuple(page.rect)[-2:]
        raw['width'] = width
        raw['height'] = height

        layout = self._layout = Layout(raw, page.rotationMatrix)

        # rectangle shapes found in the page source
        layout.rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes) of the PDF page, e.g.
        # highlight, underline and strike-through-line
        layout.rects.from_annotations(page)

        # nothing to plot outside of debug mode
        if not self.debug_mode:
            return layout

        # open a new section for the current pdf page, then plot the initial layout
        debug_doc = self._doc_debug
        new_page_section(debug_doc, layout.width, layout.height, f'Page {page.number}')
        for title, key in (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        ):
            layout.plot(debug_doc, title, key=key)

        return layout
Ejemplo n.º 2
0
    def initialize(self, page: fitz.Page):
        '''Assemble the raw layout of a page and wrap it in a `Layout`.

        Steps, in order: read the `rawdict` text extraction, fix the page
        size, replace the image blocks by re-extracted images, add the
        images and paths produced by a `PathsExtractor`, and finally create
        the `Layout`.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            The `Layout` instance, which is also stored in `self._layout`.
        '''
        # NOTE: every coordinate in the raw dict is relative to the un-rotated page, see
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw = page.getText('rawdict')

        # Page size: the 'width'/'height' of the raw dict describe the un-rotated
        # page, while `page.rect` always reflects the rotation.
        width, height = tuple(page.rect)[-2:]
        raw['width'] = width
        raw['height'] = height

        # Page images: the image bytes in the raw dict can't reproduce transparent
        # images, so the original image blocks are disabled (type -1) ...
        blocks = raw['blocks']
        for block in blocks:
            if block['type'] == 1:
                block['type'] = -1

        # ... and the images are extracted again from the page.
        blocks.extend(ImagesExtractor.extract_images(page))

        # Page paths: vector graphic paths converted to pixmap images, plus the
        # paths themselves.
        extractor = PathsExtractor()
        self._paths_extractor = extractor
        path_images, path_data = extractor.extract_paths(page)
        blocks.extend(path_images)
        raw['paths'] = path_data

        # init layout
        layout = self._layout = Layout(raw, page.rotationMatrix)
        return layout
Ejemplo n.º 3
0
    def _page_blocks(page:fitz.Page):
        '''Return the raw layout dict of a page with recovered image contents.

        The `rawdict` extraction yields one image block per image location,
        duplicates included, whereas `Page.getImageList()` lists each image
        only once:
        https://pymupdf.readthedocs.io/en/latest/textpage.html#dictionary-structure-of-extractdict-and-extractrawdict

        As a compromise, image locations are taken from the raw dict and
        image contents from `ImagesExtractor.extract_images`.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            dict: The raw dict of the page, with image blocks updated in place
            and unmatched recovered images appended to its 'blocks' list.
        '''
        # NOTE: every coordinate in the raw dict is relative to the un-rotated page, see
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw = page.getText('rawdict')

        # extract and recover images
        recovered = ImagesExtractor.extract_images(page)

        # original image blocks sharing the same image contents end up in one group
        groups = defaultdict(list)
        for block in raw['blocks']:
            if block['type'] == 1:
                groups[hash(block['image'])].append(block)

        def shares_bbox(candidate, blocks):
            '''Whether any block has the same rounded bbox as `candidate`.'''
            target = [round(value) for value in candidate['bbox']]
            return any(
                [round(value) for value in block['bbox']] == target
                for block in blocks
            )

        for image in recovered:
            # first group that contains a block located at the recovered image
            group = next(
                (members for members in groups.values() if shares_bbox(image, members)),
                None,
            )

            # an image outside the page is not counted in page.getText(),
            # so it is added to the raw layout here
            if group is None:
                raw['blocks'].append(image)
                continue

            # every block of the matching group receives the recovered contents
            for member in group:
                member['image'] = image['image']

        return raw
Ejemplo n.º 4
0
    def initialize(self, page:fitz.Page):
        '''Build the layout of a page and collect its rectangle shapes.

        The layout is created from the `rawdict` text extraction of `page`
        together with the page rotation matrix. Rectangle shapes are then
        read from the page stream and from the page annotations.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            The `Layout` instance, which is also stored in `self._layout`.
        '''
        # NOTE: every coordinate in the raw dict is relative to the un-rotated page, see
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw = page.getText('rawdict')

        # The raw dict carries its own 'width'/'height', but those describe the
        # un-rotated page. `page.rect` always reflects the rotation, so its last
        # two values replace them.
        width, height = tuple(page.rect)[-2:]
        raw['width'] = width
        raw['height'] = height

        layout = self._layout = Layout(raw, page.rotationMatrix)

        # rectangle shapes found in the page source
        layout.rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes) of the PDF page, e.g.
        # highlight, underline and strike-through-line
        layout.rects.from_annotations(page)

        return layout
Ejemplo n.º 5
0
    def init(self, page: fitz.Page) -> Layout:
        '''Build the layout of a page and collect its rectangle shapes.

        The layout is created from the `rawdict` text extraction of `page`.
        Rectangle shapes are then read from every content stream of the page
        and from the page annotations. When `self.debug_mode` is set, the
        initial layout is plotted into the debug document as well.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            Layout: The new layout, which is also stored in `self._layout`.
        '''
        # Layout object based on raw dict
        layout = self._layout = Layout(page.getText('rawdict'))

        # Rectangle shapes from the page source: these are generally converted
        # from docx (e.g. highlight, underline) and differ from PDF comments
        # such as highlight or rectangle.
        if not page._isWrapped:
            page._wrapContents()

        # transformation matrix from PDF to PyMuPDF (PyMuPDF>=1.17.0)
        matrix = page.transformationMatrix

        for xref in page.getContents():
            stream = self._doc_pdf._getXrefStream(xref)
            content = stream.decode(encoding="ISO-8859-1")
            layout.rects.from_stream(content, matrix)

        # annotations (comment shapes) of the PDF page: highlight, underline
        # and strike-through-line only
        page_annots = page.annots()
        layout.rects.from_annotations(page_annots)

        # nothing to plot outside of debug mode
        if not self.debug_mode:
            return layout

        # open a new section for the current pdf page, then plot the initial layout
        debug_doc = self._doc_debug
        new_page_section(debug_doc, layout.width, layout.height,
                         f'Page {page.number}')
        for title, key in (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        ):
            layout.plot(debug_doc, title, key=key)

        return layout
Ejemplo n.º 6
0
    def initialize(self, page:fitz.Page):
        '''Assemble the raw layout of a page and wrap it in a `Layout`.

        The `rawdict` text extraction of `page` gets the rotation-aware page
        size, the images and paths returned by a `PathsExtractor`, and is
        then passed to `Layout` together with the page rotation matrix.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            The `Layout` instance, which is also stored in `self._layout`.
        '''
        # NOTE: every coordinate in the raw dict is relative to the un-rotated page, see
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw = page.getText('rawdict')

        # The raw dict carries its own 'width'/'height', but those describe the
        # un-rotated page. `page.rect` always reflects the rotation, so its last
        # two values replace them.
        width, height = tuple(page.rect)[-2:]
        raw['width'] = width
        raw['height'] = height

        # pdf paths and converted images
        extractor = PathsExtractor()
        self._paths = extractor
        parsed = extractor.parse(page)
        converted_images, path_data = parsed.filter_pixmaps(page)
        raw['blocks'].extend(converted_images)
        raw['paths'] = path_data

        # init layout
        layout = self._layout = Layout(raw, page.rotationMatrix)
        return layout
Ejemplo n.º 7
0
    def init(self, page:fitz.Page) -> Layout:
        '''Build the layout of a page and collect its rectangle shapes.

        The `rawdict` text extraction of `page` is passed through
        `split_blocks` and the result is wrapped in a `Layout`. Rectangle
        shapes are then read from the page stream and from the page
        annotations. When `self.debug_mode` is set, the initial layout is
        plotted into the debug document as well.

        Args:
            page (fitz.Page): Page to process.

        Returns:
            Layout: The new layout, which is also stored in `self._layout`.
        '''
        # Layout object based on the raw dict after block splitting
        raw = page.getText('rawdict')
        layout = self._layout = Layout(split_blocks(raw))

        # rectangle shapes found in the page source
        layout.rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes) of the PDF page, e.g.
        # highlight, underline and strike-through-line
        layout.rects.from_annotations(page)

        # nothing to plot outside of debug mode
        if not self.debug_mode:
            return layout

        # open a new section for the current pdf page, then plot the initial layout
        debug_doc = self._doc_debug
        new_page_section(debug_doc, layout.width, layout.height, f'Page {page.number}')
        for title, key in (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        ):
            layout.plot(debug_doc, title, key=key)

        return layout