Exemple #1
0
    def init_backward_sync(self, event):
        """Translate a pointer event into a page position and trigger backward sync.

        The event coordinates are clamped to the area covered by the pages,
        converted to a page number plus a position on that page (pixels
        divided by the layouter's scale factor), and the word and line found
        by Poppler at that position are handed to
        ``self.preview.document.backward_sync``.

        Args:
            event: object providing the pointer position as ``x`` and ``y``.

        Returns:
            ``False`` when the layouter has no layout yet, ``None`` otherwise.
        """
        layouter = self.layouter
        if not layouter.has_layout:
            return False

        number_of_pages = self.preview.number_of_pages
        # Vertical distance between the tops of two consecutive pages.
        page_stride = layouter.page_height + layouter.page_gap

        # Clamp the pointer position to the area spanned by the pages.
        y_total_pixels = min(
            max(event.y - layouter.vertical_margin, 0),
            page_stride * number_of_pages - layouter.page_gap)
        x_pixels = min(max(event.x - layouter.horizontal_margin, 0),
                       layouter.page_width)

        page_index = math.floor(y_total_pixels / page_stride)
        # With a page gap of zero, a click on the very bottom edge yields
        # page_index == number_of_pages; keep it on the last page so that
        # get_page() below is never asked for a page that does not exist.
        page_index = max(min(page_index, number_of_pages - 1), 0)

        y_pixels = min(max(y_total_pixels - page_index * page_stride, 0),
                       layouter.page_height)
        x = x_pixels / layouter.scale_factor
        y = y_pixels / layouter.scale_factor
        # backward_sync receives the page number counted from 1.
        page = page_index + 1

        with self.preview.poppler_document_lock:
            poppler_page = self.preview.poppler_document.get_page(page_index)
            # Degenerate rectangle: both corners sit on the clicked point.
            rect = Poppler.Rectangle()
            rect.x1 = rect.x2 = max(min(x, self.preview.page_width), 0)
            rect.y1 = rect.y2 = max(min(y, self.preview.page_height), 0)
            word = poppler_page.get_selected_text(
                Poppler.SelectionStyle.WORD, rect)
            context = poppler_page.get_selected_text(
                Poppler.SelectionStyle.LINE, rect)
        self.preview.document.backward_sync(page, x, y, word, context)
Exemple #2
0
def pdf_area(page, stroke):
    """
    Compute the PDF page rectangle that encloses a stroke.

    The bounding box of the stroke's segments is widened by 15 on the left
    and on the right, then multiplied by the scale factor of the page.

    :param page: Poppler PDF page object.
    :param stroke: reMarkable tablet stroke data.
    :return: ``Poppler.Rectangle`` enclosing the stroke.
    """
    # Convert every segment once; index 0 is x, index 1 is y.
    points = [to_point(segment) for segment in stroke.segments]
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    factor = pdf_scale(page)

    area = Poppler.Rectangle()
    area.x1 = (left - 15) * factor
    area.y1 = top * factor
    area.x2 = (right + 15) * factor
    area.y2 = bottom * factor
    # The first corner must lie strictly before the second on both axes.
    assert area.x1 < area.x2
    assert area.y1 < area.y2

    return area
Exemple #3
0
    def get_annot_action(self, link_type, action, rect):
        """ Build the callback to run when an annotation's link is followed.

        Rendition actions are registered in ``self.medias`` and get a callback that plays the media;
        every other link type is delegated to :meth:`get_link_action`.

        Args:
            link_type (:class:`~Poppler.ActionType`): The link type
            action (:class:`~Poppler.Action`): The action to be performed when the link is clicked
            rect (:class:`~Poppler.Rectangle`): The region of the page where the link is

        Returns:
            `function`: The function to be called to follow the link, or `None` if the media file
            can not be extracted or found
        """
        if link_type != Poppler.ActionType.RENDITION:
            return self.get_link_action(link_type, action)

        rendition_media = action.rendition.media
        if rendition_media.is_embedded():
            # Reserve a temporary file name, then let poppler write the embedded media into it.
            suffix = get_extension(rendition_media.get_mime_type())
            with tempfile.NamedTemporaryFile('wb',
                                             suffix=suffix,
                                             prefix='pdf_embed_',
                                             delete=False) as tmp:
                filename = tmp.name
                self.parent.remove_on_exit(filename)
            if not rendition_media.save(filename):
                logger.error(_("Pympress can not extract embedded media"))
                return None
        else:
            filename = self.parent.get_full_path(rendition_media.get_filename())
            if not filename:
                logger.error(
                    _("Pympress can not find file ") +
                    rendition_media.get_filename())
                return None

        # TODO grab the show_controls, autoplay, repeat
        # Margins of the link area, expressed as fractions of the page width and height.
        margins = Poppler.Rectangle()
        margins.x1 = rect.x1 / self.pw  # left
        margins.y1 = rect.y1 / self.ph  # bottom
        margins.x2 = 1.0 - rect.x2 / self.pw  # right
        margins.y2 = 1.0 - rect.y2 / self.ph  # top

        entry = (margins, filename, False)
        self.medias.append(entry)
        return Link.build_closure(self.parent.play_media, hash(entry))
Exemple #4
0
def main():
    """Dump the text, per-character boxes and font attributes of a PDF.

    The PDF path is read from ``sys.argv[1]``; a relative path is resolved
    against the current working directory. For every page the page size is
    printed, then each text line followed by one line per character with its
    bounding box and, when an attribute run covers it, the font name, size
    and colour.
    """
    print('Version:', Poppler.get_version())
    path = sys.argv[1]
    if not os.path.isabs(path):
        path = os.path.join(os.getcwd(), path)
    d = Poppler.Document.new_from_file('file:' + path)
    for pg_no in range(d.get_n_pages()):
        p = d.get_page(pg_no)
        print('Page %d' % (pg_no + 1), 'size ', p.get_size())
        text = p.get_text()
        # Only decode when the binding hands back raw bytes; calling
        # .decode() on a str raises AttributeError.
        if isinstance(text, bytes):
            text = text.decode('UTF-8')
        locs = get_page_layout(p)
        fonts = p.get_text_attributes()
        # Index, within the page text, of the first character of the line.
        offset = 0
        # Index of the attribute run to try first; it only moves forward.
        cfont = 0
        for line in text.splitlines(True):
            # Print the text itself (not a bytes repr) without doubling the
            # newline kept by splitlines(True).
            print(' ', line.rstrip('\n'))
            for i, char in enumerate(line):
                if char == u'\n':
                    continue
                pos = offset + i
                # Advance to the attribute run covering this character.
                # Once the runs are exhausted, font stays None instead of
                # indexing past the end of the list.
                font = None
                while cfont < len(fonts):
                    candidate = fonts[cfont]
                    if candidate.start_index <= pos <= candidate.end_index:
                        font = candidate
                        break
                    cfont += 1

                bb = locs[pos]
                print(char, '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb)
                if font:
                    print(
                        font.font_name, font.font_size, 'r=%d g=%d, b=%d' %
                        (font.color.red, font.color.green, font.color.blue))
            offset += len(line)
            print()
        print()
Exemple #5
0
    def init_backward_sync(self, event):
        """Translate a pointer event into a page position and trigger backward sync.

        The event coordinates are clamped to the area covered by the pages,
        converted to a page number plus a position on that page (pixels
        divided by the layout's scale factor), and the word and line found by
        Poppler at that position are handed to
        ``self.preview.document.build_system.backward_sync``.

        Args:
            event: object providing the pointer position as ``x`` and ``y``.

        Returns:
            ``False`` when the preview has no layout yet, ``None`` otherwise.
        """
        layout = self.preview.layout
        if layout is None:
            return False

        window_width = self.view.get_allocated_width()
        number_of_pages = self.preview.poppler_document.get_n_pages()
        # Vertical distance between the tops of two consecutive pages.
        page_stride = layout.page_height + layout.page_gap

        # Clamp the pointer position to the area spanned by the pages.
        y_total_pixels = min(
            max(event.y, 0),
            page_stride * number_of_pages - layout.page_gap)
        x_pixels = min(
            max(event.x - layout.get_horizontal_margin(window_width), 0),
            layout.page_width)

        page_index = math.floor(y_total_pixels / page_stride)
        # With a page gap of zero, a click on the very bottom edge yields
        # page_index == number_of_pages; keep it on the last page so that
        # get_page() below is never asked for a page that does not exist.
        page_index = max(min(page_index, number_of_pages - 1), 0)

        y_pixels = min(max(y_total_pixels - page_index * page_stride, 0),
                       layout.page_height)
        x = x_pixels / layout.scale_factor
        y = y_pixels / layout.scale_factor
        # backward_sync receives the page number counted from 1.
        page = page_index + 1

        poppler_page = self.preview.poppler_document.get_page(page_index)
        # Degenerate rectangle: both corners sit on the clicked point.
        rect = Poppler.Rectangle()
        rect.x1 = rect.x2 = max(min(x, self.preview.page_width), 0)
        rect.y1 = rect.y2 = max(min(y, self.preview.page_height), 0)
        word = poppler_page.get_selected_text(Poppler.SelectionStyle.WORD,
                                              rect)
        context = poppler_page.get_selected_text(Poppler.SelectionStyle.LINE,
                                                 rect)
        self.preview.document.build_system.backward_sync(
            page, x, y, word, context)
Exemple #6
0
    def get_annot_action(self, link_type, action, rect):
        """ Build the callback to run when an annotation's link is followed.

        Rendition actions are registered in ``self.medias`` and get a callback that plays the media;
        every other link type is delegated to ``get_link_action``. Returns `None` when the media
        file can not be extracted or found.
        """
        if link_type != Poppler.ActionType.RENDITION:
            return self.get_link_action(link_type, action)

        rendition_media = action.rendition.media
        if rendition_media.is_embedded():
            # Reserve a temporary file name, then let poppler write the embedded media into it.
            suffix = get_extension(rendition_media.get_mime_type())
            with tempfile.NamedTemporaryFile('wb',
                                             suffix=suffix,
                                             prefix='pdf_embed_',
                                             delete=False) as tmp:
                filename = tmp.name
                self.parent.remove_on_exit(filename)
            if not rendition_media.save(filename):
                print(_("Pympress can not extract embedded media"))
                return None
        else:
            filename = self.parent.get_full_path(rendition_media.get_filename())
            if not filename:
                print(
                    _("Pympress can not find file ") +
                    rendition_media.get_filename())
                return None

        # TODO grab the show_controls, autoplay, repeat
        # Margins of the link area, expressed as fractions of the page width and height.
        margins = Poppler.Rectangle()
        margins.x1 = rect.x1 / self.pw  # left
        margins.y1 = rect.y1 / self.ph  # bottom
        margins.x2 = 1.0 - rect.x2 / self.pw  # right
        margins.y2 = 1.0 - rect.y2 / self.ph  # top

        entry = (margins, filename, False)
        self.medias.append(entry)

        def follow():
            # The media is identified by the hash of its entry in self.medias.
            return pympress.ui.UI.play_media(hash(entry))

        return follow
def main():
    
    print 'Version:', Poppler.get_version()
    path=sys.argv[1]
    if not os.path.isabs(path):
        path=os.path.join(os.getcwd(), path)
    d=Poppler.Document.new_from_file('file:'+path)
    n=d.get_n_pages()
    for pg_no in range(n):
        p=d.get_page(pg_no)
        print 'Page %d' % (pg_no+1), 'size ', p.get_size()
        text=p.get_text().decode('UTF-8')
        locs=get_page_layout(p)
        fonts=p.get_text_attributes()
        offset=0
        cfont=0
        for line in text.splitlines(True):
            print ' ', line.encode('UTF-8'),
            n=len(line)
            for i in range(n):
                if line[i]==u'\n':
                    continue
                font=fonts[cfont]
                while font.start_index > i+offset or font.end_index < i+offset:
                    cfont+=1
                    if cfont>= len(fonts):
                        font=None
                        break
                    font=fonts[cfont]
                
                bb=locs[offset+i]
                print line[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb,
                if font:
                    print font.font_name, font.font_size, 'r=%d g=%d, b=%d'%(font.color.red, font.color.green, font.color.blue),
            offset+=n
            print
                
        print
Exemple #8
0
    def get_structure(self, index_iter=None):
        """ Gets the structure of the document from its index.

        Recursive, pass the iterator.

        Args:
            index_iter (:class:`~Poppler.IndexIter` or `None`): the iterator for the child index to explore.

        Returns:
            `dict`: maps a page number (poppler's ``page_num`` minus one) to a dict holding the ``'title'``
            of the entry and, when the entry has sub-entries, a ``'children'`` dict of the same shape.
            Empty when no index iterator could be obtained.
        """
        try:
            if index_iter is None:
                index_iter = Poppler.IndexIter(self.doc)
        except TypeError:
            return {}
        if index_iter is None:
            return {}

        index = {}
        while True:
            action = index_iter.get_action()
            title = ''
            try:
                if action.type == Poppler.ActionType.GOTO_DEST:
                    title = action.goto_dest.title
                    if action.goto_dest.dest.type == Poppler.DestType.NAMED:
                        dest = self.parent.doc.find_dest(
                            action.goto_dest.dest.named_dest)
                        page = dest.page_num - 1
                    elif action.goto_dest.dest.type == Poppler.DestType.UNKNOWN:
                        raise AssertionError('Unknown type of destination')
                    else:
                        page = action.goto_dest.dest.page_num - 1
                else:
                    raise AssertionError('Unexpected type of action')
            except Exception:
                logger.error(
                    _('Unexpected action in index "{}"').format(action.type))
                page = None

            new_entry = {'title': title}
            child = index_iter.get_child()
            if child:
                new_entry['children'] = self.get_structure(child)

            # there should not be synonymous sections, correct the page here to a better guess
            if page is None or page in index:
                if new_entry.get('children'):
                    page = min(new_entry['children'])
                else:
                    # Highest page referenced so far, following the last entry down through its children.
                    # Stays at -1 when nothing has been indexed yet.
                    lower_bound = -1
                    find = {}
                    if index:
                        lower_bound = max(index)
                        find = index[lower_bound]
                    while find.get('children'):
                        last_child = max(find['children'])
                        # never go below the top-level bound, so the guess can not collide with a key of index
                        lower_bound = max(lower_bound, last_child)
                        find = find['children'][last_child]

                    if page is None:
                        # no page, hence no label to look for: take the page after the last indexed one
                        page = lower_bound + 1
                    else:
                        try:
                            label = self.page_labels[page]
                            # first page after lower_bound carrying the same label as the duplicate
                            page = min(
                                number for number, other in enumerate(self.page_labels)
                                if other == label and number > lower_bound)
                        except (IndexError, ValueError):  # page without label, or no later page with this label
                            page = lower_bound + 1

            index[page] = new_entry

            if not index_iter.next():
                break

        return index
Exemple #9
0
    def __init__(self, page, number, parent):
        """ Wrap a poppler page and collect its links, media and annotation texts.

        Args:
            page: the poppler object around the page
            number: number of the page in the document, stored as ``page_nb``
            parent: the owning document; used here to resolve file paths, play media and
                register temporary files for removal on exit
        """
        self.page = page
        self.page_nb = number
        self.parent = parent
        self.page_label = self.page.get_label()
        self.links = []
        self.medias = []
        self.annotations = []

        # Read page size
        self.pw, self.ph = self.page.get_size()

        # Read links on the page
        for link in self.page.get_link_mapping():
            action = self.get_link_action(link.action.type, link.action)
            my_link = Link(link.area.x1, link.area.y1, link.area.x2,
                           link.area.y2, action)
            self.links.append(my_link)

        # Read annotations, in particular those that indicate media
        for annotation in self.page.get_annot_mapping():
            content = annotation.annot.get_contents()
            if content:
                self.annotations.append(content)

            annot_type = annotation.annot.get_annot_type()
            if annot_type == Poppler.AnnotType.LINK:
                # just an Annot, not subclassed -- probably redundant with links
                continue
            elif annot_type == Poppler.AnnotType.MOVIE:
                movie = annotation.annot.get_movie()
                filepath = self.parent.get_full_path(movie.get_filename())
                if filepath:
                    # TODO there is no autoplay, or repeatCount
                    relative_margins = Poppler.Rectangle()
                    relative_margins.x1 = annotation.area.x1 / self.pw  # left
                    relative_margins.x2 = 1.0 - annotation.area.x2 / self.pw  # right
                    relative_margins.y1 = annotation.area.y1 / self.ph  # bottom
                    relative_margins.y2 = 1.0 - annotation.area.y2 / self.ph  # top
                    media = (relative_margins, filepath, movie.show_controls())
                    self.medias.append(media)
                    action = Link.build_closure(self.parent.play_media,
                                                hash(media))
                else:
                    logger.error(
                        _("Pympress can not find file ") +
                        movie.get_filename())
                    continue
            elif annot_type == Poppler.AnnotType.SCREEN:
                action_obj = annotation.annot.get_action()
                if not action_obj:
                    continue
                action = self.get_annot_action(action_obj.any.type, action_obj,
                                               annotation.area)
                if not action:
                    continue
            elif annot_type == Poppler.AnnotType.FILE_ATTACHMENT:
                attachment = annotation.annot.get_attachment()
                # The name comes from the PDF. Keep only its base name: a directory part in the
                # prefix would make the temporary file creation fail, or place the file outside
                # of the temporary directory.
                prefix, ext = os.path.splitext(
                    os.path.basename(attachment.name or ''))
                with tempfile.NamedTemporaryFile('wb',
                                                 suffix=ext,
                                                 prefix=prefix,
                                                 delete=False) as f:
                    # now the file name is shotgunned
                    filename = f.name
                    self.parent.remove_on_exit(filename)
                if not attachment.save(filename):
                    logger.error(_("Pympress can not extract attached file"))
                    continue
                action = Link.build_closure(fileopen, filename)
            elif annot_type in {
                    Poppler.AnnotType.TEXT, Poppler.AnnotType.POPUP,
                    Poppler.AnnotType.FREE_TEXT
            }:
                # text-only annotations, hide them from screen
                self.page.remove_annot(annotation.annot)
                continue
            elif annot_type in {
                    Poppler.AnnotType.STRIKE_OUT, Poppler.AnnotType.HIGHLIGHT,
                    Poppler.AnnotType.UNDERLINE, Poppler.AnnotType.SQUIGGLY,
                    Poppler.AnnotType.POLYGON, Poppler.AnnotType.POLY_LINE,
                    Poppler.AnnotType.SQUARE, Poppler.AnnotType.CIRCLE,
                    Poppler.AnnotType.CARET, Poppler.AnnotType.LINE,
                    Poppler.AnnotType.STAMP, Poppler.AnnotType.INK
            }:
                # Poppler already renders annotation of these types, nothing more can be done
                # even though the rendering isn't always perfect.
                continue
            else:
                logger.warning(
                    _("Pympress can not interpret annotation of type:") +
                    " {} ".format(annot_type))
                continue

            # Only annotations that produced an action reach this point: make them clickable.
            my_annotation = Link(annotation.area.x1, annotation.area.y1,
                                 annotation.area.x2, annotation.area.y2,
                                 action)
            self.links.append(my_annotation)
Exemple #10
0
    def __init__(self, page, number, parent):
        """ Wrap a poppler page and collect its links, media and text annotations.

        Args:
            page (:class:`Poppler.Page`):  the poppler object around the page
            number (integer):  number of the page to fetch in the document
            parent (:class:`pympress.document.Document`):  the parent Document class
        """
        self.page = page
        self.page_nb = number
        self.parent = parent
        self.links = []
        self.medias = []
        self.annotations = []

        # Read page size
        self.pw, self.ph = self.page.get_size()

        # Read links on the page
        for link in self.page.get_link_mapping():
            action = self.get_link_action(link.action.type, link.action)
            my_link = Link(link.area.x1, link.area.y1, link.area.x2,
                           link.area.y2, action)
            self.links.append(my_link)

        # Read annotations, in particular those that indicate media
        for annotation in self.page.get_annot_mapping():
            annot_type = annotation.annot.get_annot_type()
            if annot_type == Poppler.AnnotType.LINK:
                # just an Annot, not subclassed -- probably redundant with links
                continue
            elif annot_type == Poppler.AnnotType.MOVIE:
                movie = annotation.annot.get_movie()
                filepath = self.parent.get_full_path(movie.get_filename())
                if filepath:
                    # TODO there is no autoplay, or repeatCount
                    relative_margins = Poppler.Rectangle()
                    relative_margins.x1 = annotation.area.x1 / self.pw  # left
                    relative_margins.x2 = 1.0 - annotation.area.x2 / self.pw  # right
                    relative_margins.y1 = annotation.area.y1 / self.ph  # bottom
                    relative_margins.y2 = 1.0 - annotation.area.y2 / self.ph  # top
                    media = (relative_margins, filepath, movie.show_controls())
                    self.medias.append(media)
                    # Bind the hash now: `media` is rebound by every later movie annotation of this
                    # loop, so a plain closure would make all links play the last media of the page.
                    action = lambda media_id=hash(media): pympress.ui.UI.play_media(media_id)
                else:
                    logger.error(
                        _("Pympress can not find file ") +
                        movie.get_filename())
                    continue
            elif annot_type == Poppler.AnnotType.SCREEN:
                action_obj = annotation.annot.get_action()
                if not action_obj:
                    # nothing to follow for a screen annotation without action
                    continue
                action = self.get_annot_action(action_obj.any.type, action_obj,
                                               annotation.area)
                if not action:
                    continue
            elif annot_type == Poppler.AnnotType.TEXT:
                self.annotations.append(annotation.annot.get_contents())
                # hide post-it sort of button on screen
                self.page.remove_annot(annotation.annot)
                continue
            elif annot_type == Poppler.AnnotType.FREE_TEXT:
                # Poppler already renders annotation of this type
                continue
            else:
                logger.warning(
                    _("Pympress can not interpret annotation of type:") +
                    " {} ".format(annot_type))
                continue

            # Only annotations that produced an action reach this point: make them clickable.
            my_annotation = Link(annotation.area.x1, annotation.area.y1,
                                 annotation.area.x2, annotation.area.y2,
                                 action)
            self.links.append(my_annotation)
Exemple #11
0
import os
import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

# Refuse to load the module when the installed Poppler is older than 0.46.
poppler_version = Poppler.get_version()
if LooseVersion(poppler_version) < LooseVersion('0.46'):  # pragma: no cover
    raise ValueError(
        "mat2 needs at least Poppler version 0.46 to work. "
        "The installed version is %s." % poppler_version)  # pragma: no cover


class PDFParser(abstract.AbstractParser):
    mimetypes = {
        'application/pdf',
    }
    meta_list = {
        'author', 'creation-date', 'creator', 'format', 'keywords', 'metadata',
        'mod-date', 'producer', 'subject', 'title', 'viewer-preferences'
    }

    def __init__(self, filename):