Python DictionaryObject 예제들, PyPDF2.generic.DictionaryObject Python 예제들

예제 #1

0

파일 보기

파일: PyPDF2Highlight.py 프로젝트: MartinThoma/algorithms

def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """Build a PDF ``/Highlight`` annotation dictionary for a rectangle.

    Args:
        x1, y1: First corner; written as the first coordinate pair of ``/Rect``.
        x2, y2: Opposite corner; written as the second pair of ``/Rect``.
        meta: Mapping read for ``"author"`` (stored under ``/T``) and
            ``"contents"`` (stored under ``/Contents``).
        color: Iterable of color components stored under ``/C``.

    Returns:
        The populated ``DictionaryObject``.
    """
    rect_coords = (x1, y1, x2, y2)
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_coords = (x1, y2, x2, y2, x1, y1, x2, y1)

    entries = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject(
            [FloatObject(v) for v in rect_coords]),
        NameObject("/QuadPoints"): ArrayObject(
            [FloatObject(v) for v in quad_coords]),
    }

    annotation = DictionaryObject()
    annotation.update(entries)
    return annotation

예제 #2

0

파일 보기

def test_DictionaryObject_read_from_stream_broken():
    """A dictionary that opens with a single ``<`` must raise PdfReadError."""
    expected_message = (
        "Dictionary read error at byte 0x2: stream must begin with '<<'"
    )
    broken_stream = BytesIO(b"< /S /GoTo >>")
    # No reader object is supplied for this check.
    with pytest.raises(PdfReadError) as exc:
        DictionaryObject.read_from_stream(broken_stream, None)
    assert exc.value.args[0] == expected_message

예제 #3

0

파일 보기

파일: splitter_tmp.py 프로젝트: chebrolu/Question-Bank

def create_annot_box(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Square`` annotation dictionary for the given rectangle.

    Args:
        x1, y1: First coordinate pair of ``/Rect``.
        x2, y2: Second coordinate pair of ``/Rect``.
        meta: Mapping read for ``"author"`` (``/T``) and ``"contents"``
            (``/Contents``).
        color: Iterable of color components stored under ``/C``.

    Returns:
        The populated ``DictionaryObject``. No parent page entry (``/P``) is
        set by this function.
    """
    fields = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Square"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject(
            [FloatObject(coord) for coord in (x1, y1, x2, y2)]),
    }
    box = DictionaryObject()
    box.update(fields)
    return box

예제 #4

0

파일 보기

파일: PyPDF2Highlight.py 프로젝트: kod2nd/poor-man-redactor

def createHighlight(x0, y0, x1, y1, color=[0, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary without author or text.

    Args:
        x0, y0: First coordinate pair of ``/Rect``.
        x1, y1: Second coordinate pair of ``/Rect``.
        color: Iterable of color components stored under ``/C``.

    Returns:
        The populated ``DictionaryObject``.
    """
    def as_float_array(values):
        # Wrap every number in a FloatObject and collect them in a PDF array.
        return ArrayObject([FloatObject(v) for v in values])

    fields = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): as_float_array((x0, y0, x1, y1)),
        # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
        NameObject("/QuadPoints"): as_float_array(
            (x0, y1, x1, y1, x0, y0, x1, y0)),
    }

    highlight = DictionaryObject()
    highlight.update(fields)
    return highlight

예제 #5

0

파일 보기

    def createHighlight(self,x1, y1, x2, y2, meta, color = [1, 0, 0]):
        """Build a ``/Highlight`` annotation dictionary for a rectangle.

        ``self`` is not used; the result depends only on the arguments.

        Args:
            x1, y1: First coordinate pair of ``/Rect``.
            x2, y2: Second coordinate pair of ``/Rect``.
            meta: Mapping read for ``"author"`` (``/T``) and ``"contents"``
                (``/Contents``).
            color: Iterable of color components stored under ``/C``.

        Returns:
            The populated ``DictionaryObject``.
        """
        annotation = DictionaryObject()
        annotation.update({
            NameObject("/F"): NumberObject(4),
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Highlight"),
            NameObject("/T"): TextStringObject(meta["author"]),
            NameObject("/Contents"): TextStringObject(meta["contents"]),
            NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
            NameObject("/Rect"): ArrayObject(
                [FloatObject(v) for v in (x1, y1, x2, y2)]),
            # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
            NameObject("/QuadPoints"): ArrayObject(
                [FloatObject(v)
                 for v in (x1, y2, x2, y2, x1, y1, x2, y1)]),
        })
        return annotation

예제 #6

0

파일 보기

파일: pdfutils.py 프로젝트: workos/winterpy

def add_xobject_to_page(page, obj_id):
    """Register *obj_id* under the first unused ``/img_<n>`` XObject name.

    The page's ``/Resources`` dictionary and its ``/XObject`` sub-dictionary
    are created when missing.

    Args:
        page: Page dictionary to modify in place.
        obj_id: Value stored under the chosen name.

    Returns:
        The ``NameObject`` under which *obj_id* was stored.
    """
    resources = page.setdefault(NameObject('/Resources'), DictionaryObject())
    xobjects = resources.setdefault(NameObject('/XObject'), DictionaryObject())

    # Probe /img_0, /img_1, ... until a free name is found.
    index = 0
    candidate = NameObject('/img_%s' % index)
    while candidate in xobjects:
        index += 1
        candidate = NameObject('/img_%s' % index)

    xobjects[candidate] = obj_id
    return candidate

예제 #7

0

파일 보기

def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict):
    """A ``stream`` keyword without a ``/Length`` entry must raise PdfReadError."""

    class _FakePdf:  # minimal stand-in for the reader; only ``strict`` is set
        strict = False

    fake_pdf = _FakePdf()
    fake_pdf.strict = strict

    data = BytesIO(b"<< /S /GoTo >>stream\n")
    with pytest.raises(PdfReadError) as exc:
        DictionaryObject.read_from_stream(data, fake_pdf)

    message = exc.value.args[0]
    assert message == "Stream length not defined"

예제 #8

0

파일 보기

def createHighlight(bbox=(0, 0, 1, 1),
                    contents="",
                    color=[1, 1, 0],
                    author="iwasakishuto(@cabernet_rock)"):
    """Build a ``/Highlight`` annotation dictionary from a bounding box.

    Args:
        bbox (tuple)   : Four numbers ``(x1, y1, x2, y2)`` written to ``/Rect``.
        contents (str) : Text stored under ``/Contents``.
        color (list)   : Color components stored under ``/C``. Defaults to ``[1,1,0]``.
        author (str)   : Text stored under ``/T``. Defaults to ``"iwasakishuto(@cabernet_rock)"`` .

    Returns:
        DictionaryObject: The populated annotation dictionary.

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (DictionaryObject, NumberObject, FloatObject,
                                NameObject, TextStringObject, ArrayObject)
    x1, y1, x2, y2 = bbox
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_points = [x1, y2, x2, y2, x1, y1, x2, y1]

    fields = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(author),
        NameObject("/Contents"): TextStringObject(contents),
        NameObject("/C"): ArrayObject(list(map(FloatObject, color))),
        NameObject("/Rect"): ArrayObject(list(map(FloatObject, bbox))),
        NameObject("/QuadPoints"): ArrayObject(
            list(map(FloatObject, quad_points))),
    }
    highlight = DictionaryObject()
    highlight.update(fields)
    return highlight

예제 #9

0

파일 보기

파일: COISearchEngine.py 프로젝트: Berni1557/COISearchEngine

    def create_highlight(self, x1, y1, x2, y2, meta, color=[0, 1, 0]):
        """
        Build the annotation dictionary of a PDF highlight.

        Parameters
        ----------
        x1, y1 : float
            bottom left corner
        x2, y2 : float
            top right corner
        meta : dict
            read for the keys "author" and "contents"
        color : iterable
            Three elements, (r,g,b)

        Returns
        -------
        DictionaryObject
            The populated ``/Highlight`` annotation.
        """
        rect = (x1, y1, x2, y2)
        # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
        quad = (x1, y2, x2, y2, x1, y1, x2, y1)

        entries = {
            NameObject("/F"): NumberObject(4),
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Highlight"),
            NameObject("/T"): TextStringObject(meta["author"]),
            NameObject("/Contents"): TextStringObject(meta["contents"]),
            NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
            NameObject("/Rect"): ArrayObject([FloatObject(v) for v in rect]),
            NameObject("/QuadPoints"): ArrayObject(
                [FloatObject(v) for v in quad]),
        }

        result = DictionaryObject()
        result.update(entries)
        return result

예제 #10

0

파일 보기

파일: pdf_highlighter.py 프로젝트: dmitrySorokin/pdf_highlighter

def _create_annotation(x1, y1, x2, y2, color, subtype):
    """Build a minimal annotation dictionary of the given subtype.

    Only ``/Subtype``, ``/C`` and ``/Rect`` are set.

    Args:
        x1, y1: First coordinate pair of ``/Rect``.
        x2, y2: Second coordinate pair of ``/Rect``.
        color: Iterable of color components stored under ``/C``.
        subtype: Name string passed to ``NameObject`` for ``/Subtype``.

    Returns:
        The populated ``DictionaryObject``.
    """
    fields = {
        NameObject('/Subtype'): NameObject(subtype),
        NameObject('/C'): ArrayObject([FloatObject(c) for c in color]),
        NameObject('/Rect'): ArrayObject(
            [FloatObject(coord) for coord in (x1, y1, x2, y2)]),
    }
    result = DictionaryObject()
    result.update(fields)
    return result

예제 #11

0

파일 보기

def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict,
                                     file_dict, file_bin):
    """Embed one file in the PDF and record its file specification.

    Args:
        pdf_filestream: Writer whose ``_addObject`` registers new objects.
        name_arrayobj_cdict: Mapping updated in place; the filename string
            object becomes a key pointing at the registered file spec.
        file_dict: Mapping read for ``'filename'`` and ``'mod_date'``, plus the
            optional ``'desc'`` (defaults to an empty string).
        file_bin: Raw bytes of the file to embed.
    """
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    mod_date_pdf = _get_pdf_timestamp(file_dict['mod_date'])

    # Parameters of the embedded stream: MD5 hex digest, date and byte size.
    checksum_obj = createStringObject(hashlib.md5(file_bin).hexdigest())
    stream_params = DictionaryObject({
        NameObject('/CheckSum'): checksum_obj,
        NameObject('/ModDate'): createStringObject(mod_date_pdf),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    embedded = DecodedStreamObject()
    embedded.setData(file_bin)

    # Fall back to a generic binary type when the name gives no hint.
    guessed_type, _ = mimetypes.guess_type(filename)
    file_mimetype = guessed_type if guessed_type else 'application/octet-stream'
    # The '/' of the mime type is escaped as '#2f' inside the PDF name.
    subtype_name = '/' + file_mimetype.replace('/', '#2f')

    embedded.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): stream_params,
        NameObject("/Subtype"): NameObject(subtype_name),
    })
    embedded_ref = pdf_filestream._addObject(embedded)

    ef_dict = DictionaryObject({NameObject("/F"): embedded_ref})
    fname_obj = createStringObject(filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    name_arrayobj_cdict[fname_obj] = pdf_filestream._addObject(filespec_dict)

예제 #12

0

파일 보기

파일: document.py 프로젝트: glins97/PPA

    def _create_highlight(self,
                          x0,
                          y0,
                          width,
                          height,
                          comment,
                          author='',
                          color=[0, 0, 0, 0]):
        """Draw a rectangle and build the matching ``/Highlight`` annotation.

        The rectangle is first drawn through ``self.add_rect``; the returned
        dictionary covers the same area, from ``(x0, y0)`` to
        ``(x0 + width, y0 + height)``.

        Args:
            x0, y0: Origin corner of the highlighted area.
            width: Horizontal extent of the area.
            height: Vertical extent of the area.
            comment: Text stored under ``/Contents``.
            author: Text stored under ``/T``.
            color: Iterable of color components stored under ``/C``.

        Returns:
            The populated ``DictionaryObject``.
        """
        self.add_rect(x0, y0, width, height)

        # The vertical extent must come from ``height``; using ``width`` for
        # both axes only produced a correct area for square boxes.
        x1 = x0 + width
        y1 = y0 + height

        highlight = DictionaryObject()
        highlight.update({
            NameObject("/F"):
            NumberObject(4),
            NameObject("/Type"):
            NameObject("/Annot"),
            NameObject("/Subtype"):
            NameObject("/Highlight"),
            NameObject("/T"):
            TextStringObject(author),
            NameObject("/Contents"):
            TextStringObject(comment),
            NameObject("/C"):
            ArrayObject([FloatObject(c) for c in color]),
            NameObject("/Rect"):
            ArrayObject([FloatObject(v) for v in (x0, y0, x1, y1)]),
            # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
            NameObject("/QuadPoints"):
            ArrayObject([
                FloatObject(v) for v in (x0, y1, x1, y1, x0, y0, x1, y0)
            ]),
        })

        return highlight

예제 #13

0

파일 보기

파일: test_filters.py 프로젝트: mstamy2/PyPDF2

def test_FlateDecode(predictor, s):
    """
    Round-trip *s* through FlateDecode ``encode()`` and ``decode()`` with the
    given ``/Predictor`` and check the original bytes come back.
    """
    codec = FlateDecode()
    raw = s.encode()
    compressed = codec.encode(raw)
    decode_parms = DictionaryObject({"/Predictor": predictor})
    restored = codec.decode(compressed, decode_parms)
    assert restored == raw

예제 #14

0

파일 보기

파일: pdf.py 프로젝트: mausvt/flectra

    def addAttachment(self, name, data, subtype=None):
        """
        Add an attachment to the PDF. Supports adding multiple attachments, while respecting PDF/A rules.

        The attachment is appended to the ``/Names -> /EmbeddedFiles -> /Names``
        array and to the ``/AF`` array of the root object; either structure is
        created when the root object does not have it yet.

        :param name: The name of the attachment
        :param data: The data of the attachment
        :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
        """
        adapted_subtype = subtype
        if subtype:
            # If the subtype arrives as a plain mime type, try to convert it to
            # the PDF name form by escaping the '/' as '#2F'.
            if REGEX_SUBTYPE_UNFORMATED.match(subtype):
                adapted_subtype = '/' + subtype.replace('/', '#2F')

            if not REGEX_SUBTYPE_FORMATED.match(adapted_subtype):
                # The subtype still does not match the expected format, so it
                # is dropped rather than written to the document.
                _logger.warning(
                    "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                    subtype)
                adapted_subtype = ''

        attachment = self._create_attachment_object({
            'filename': name,
            'content': data,
            'subtype': adapted_subtype,
        })
        if self._root_object.get('/Names') and self._root_object['/Names'].get(
                '/EmbeddedFiles'):
            # An embedded-files name array already exists: append the
            # (filename, attachment) pair to it.
            names_array = self._root_object["/Names"]["/EmbeddedFiles"][
                "/Names"]
            names_array.extend([attachment.getObject()['/F'], attachment])
        else:
            # First attachment: build /Names -> /EmbeddedFiles -> /Names and
            # hook it into the root object.
            names_array = ArrayObject()
            names_array.extend([attachment.getObject()['/F'], attachment])

            embedded_files_names_dictionary = DictionaryObject()
            embedded_files_names_dictionary.update(
                {NameObject("/Names"): names_array})
            embedded_files_dictionary = DictionaryObject()
            embedded_files_dictionary.update({
                NameObject("/EmbeddedFiles"):
                embedded_files_names_dictionary
            })
            self._root_object.update(
                {NameObject("/Names"): embedded_files_dictionary})

        if self._root_object.get('/AF'):
            attachment_array = self._root_object['/AF']
            attachment_array.extend([attachment])
        else:
            # Create a new object containing an array referencing the embedded
            # file, and reference this array in the root catalogue.
            attachment_array = self._addObject(ArrayObject([attachment]))
            self._root_object.update({NameObject("/AF"): attachment_array})

예제 #15

0

파일 보기

파일: utils.py 프로젝트: esantus/ConceptClassification

def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Create a highlight annotation to be applied to a box on a PDF page.

    Coordinates start in the bottom left of the page.

    Args:
        x1, y1: First coordinate pair of ``/Rect``.
        x2, y2: Second coordinate pair of ``/Rect``.
        meta: Mapping read for ``"author"`` (``/T``) and ``"contents"``
            (``/Contents``).
        color: Iterable of color components stored under ``/C``.

    Returns:
        The populated ``DictionaryObject``.
    """
    def float_array(numbers):
        # Convert plain numbers into a PDF array of FloatObject values.
        return ArrayObject([FloatObject(n) for n in numbers])

    entries = {
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): float_array((x1, y1, x2, y2)),
        # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
        NameObject("/QuadPoints"): float_array(
            (x1, y2, x2, y2, x1, y1, x2, y1)),
    }
    highlight = DictionaryObject()
    highlight.update(entries)
    return highlight

예제 #16

0

파일 보기

파일: _encryption.py 프로젝트: mstamy2/PyPDF2

    def read(encryption_entry: DictionaryObject,
             first_id_entry: bytes) -> "Encryption":
        """Build an ``Encryption`` from a PDF encryption dictionary.

        Args:
            encryption_entry: The encryption dictionary. ``/Filter``, ``/V``
                and ``/R`` are read; for ``/V`` >= 4 also ``/CF``, ``/StmF``,
                ``/StrF`` and ``/EFF``.
            first_id_entry: Passed through unchanged to ``Encryption``.

        Returns:
            The configured ``Encryption`` instance.

        Raises:
            NotImplementedError: If the handler is not ``/Standard``, a
                ``/SubFilter`` is present, ``/V`` is outside 1..5, or one of
                the resolved crypt filter methods is not supported.
        """
        handler = encryption_entry.get("/Filter")
        if handler != "/Standard":
            raise NotImplementedError(
                "only Standard PDF encryption handler is available")
        if "/SubFilter" in encryption_entry:
            raise NotImplementedError("/SubFilter NOT supported")

        # Methods used when /V < 4, where no crypt filters are consulted.
        StmF = "/V2"
        StrF = "/V2"
        EFF = "/V2"

        V = encryption_entry.get("/V", 0)
        if V not in (1, 2, 3, 4, 5):
            raise NotImplementedError("Encryption V=%d NOT supported" % V)
        if V >= 4:
            filters = encryption_entry["/CF"]

            StmF = encryption_entry.get("/StmF", "/Identity")
            StrF = encryption_entry.get("/StrF", "/Identity")
            # Embedded files fall back to the stream filter name.
            EFF = encryption_entry.get("/EFF", StmF)

            # Replace each filter name by the method (/CFM) it is declared with.
            if StmF != "/Identity":
                StmF = filters[StmF]["/CFM"]  # type: ignore
            if StrF != "/Identity":
                StrF = filters[StrF]["/CFM"]  # type: ignore
            if EFF != "/Identity":
                EFF = filters[EFF]["/CFM"]  # type: ignore

            allowed_methods = ("/Identity", "/V2", "/AESV2", "/AESV3")
            if StmF not in allowed_methods:
                # The f-prefix was missing here, so the message showed the
                # literal text "{StmF}" instead of the offending method.
                raise NotImplementedError(f"StmF Method {StmF} NOT supported!")
            if StrF not in allowed_methods:
                raise NotImplementedError(f"StrF Method {StrF} NOT supported!")
            if EFF not in allowed_methods:
                raise NotImplementedError(f"EFF Method {EFF} NOT supported!")

        R = cast(int, encryption_entry["/R"])
        return Encryption(V, R, encryption_entry, first_id_entry, StmF, StrF,
                          EFF)

예제 #17

0

파일 보기

파일: pdfmanipulator.py 프로젝트: jijoy/pdfmanipulator

def _markup_annotation(rect, contents=None, author=None, subject=None,
                       color=None, alpha=1, flag=4):
    """Create the dictionary holding properties shared by markup annotations.

    ``/CA``, ``/F``, ``/Rect``, ``/Type``, ``/CreationDate`` and ``/M`` are
    always set. ``/Contents``, ``/T``, ``/Subj`` and ``/C`` are added only for
    arguments that are not ``None``.

    Returns:
        The ``DictionaryObject``, with an extra ``popup`` attribute set to
        ``False``.
    """
    annotation = DictionaryObject({
        NameObject('/CA'): FloatObject(alpha),
        NameObject('/F'): NumberObject(flag),
        NameObject('/Rect'): float_array(rect),
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/CreationDate'): now(),
        NameObject('/M'): now(),
    })
    annotation.popup = False  # Whether to add an explicit popup when adding to page

    # (key, supplied value, wrapper) for every optional entry, in write order.
    optional_entries = (
        ('/Contents', contents, TextStringObject),
        ('/T', author, TextStringObject),
        ('/Subj', subject, TextStringObject),
        ('/C', color, float_array),
    )
    for key, value, wrap in optional_entries:
        if value is not None:
            annotation[NameObject(key)] = wrap(value)
    return annotation

예제 #18

0

파일 보기

파일: test_filters.py 프로젝트: mstamy2/PyPDF2

def test_FlateDecode_unsupported_predictor():
    """
    Feed FlateDecode predictors outside the [10, 15] range and check that
    PdfReadError() is raised for each. Once support for these predictors is
    added, this test case may be removed.
    """
    unsupported = (-10, -1, 0, 9, 16, 20, 100)
    codec = FlateDecode()

    for predictor, text in cartesian_product(unsupported, filter_inputs):
        payload = text.encode()
        with pytest.raises(PdfReadError):
            compressed = codec.encode(payload)
            codec.decode(compressed,
                         DictionaryObject({"/Predictor": predictor}))

예제 #19

0

파일 보기

    def addAttachment(self, name, data, subtype=""):
        """
        Add an attachment to the PDF. Supports adding multiple attachments, while respecting PDF/A rules.

        The attachment is appended to the ``/Names -> /EmbeddedFiles -> /Names``
        array and to the ``/AF`` array of the root object; either structure is
        created when the root object does not have it yet.

        :param name: The name of the attachment
        :param data: The data of the attachment
        :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml".
        Only the plain value 'application/xml' is converted to that form here;
        any other value is passed on unchanged.
        """
        if subtype == 'application/xml':
            subtype = '/application#2Fxml'

        attachment = self._create_attachment_object({
            'filename': name,
            'content': data,
            'subtype': subtype,
        })
        if self._root_object.get('/Names') and self._root_object['/Names'].get(
                '/EmbeddedFiles'):
            # An embedded-files name array already exists: append the
            # (filename, attachment) pair to it.
            names_array = self._root_object["/Names"]["/EmbeddedFiles"][
                "/Names"]
            names_array.extend([attachment.getObject()['/F'], attachment])
        else:
            # First attachment: build /Names -> /EmbeddedFiles -> /Names and
            # hook it into the root object.
            names_array = ArrayObject()
            names_array.extend([attachment.getObject()['/F'], attachment])

            embedded_files_names_dictionary = DictionaryObject()
            embedded_files_names_dictionary.update(
                {NameObject("/Names"): names_array})
            embedded_files_dictionary = DictionaryObject()
            embedded_files_dictionary.update({
                NameObject("/EmbeddedFiles"):
                embedded_files_names_dictionary
            })
            self._root_object.update(
                {NameObject("/Names"): embedded_files_dictionary})

        if self._root_object.get('/AF'):
            attachment_array = self._root_object['/AF']
            attachment_array.extend([attachment])
        else:
            # Create a new object containing an array referencing the embedded
            # file, and reference this array in the root catalogue.
            attachment_array = self._addObject(ArrayObject([attachment]))
            self._root_object.update({NameObject("/AF"): attachment_array})

예제 #20

0

파일 보기

파일: test_filters.py 프로젝트: mstamy2/PyPDF2

def test_CCITTFaxDecode():
    """Decode empty CCITT data and compare with a recorded reference result."""
    # This was just the result PyPDF2 1.27.9 returned.
    # It would be awesome if we could check if that is actually correct.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00")

    decode_parms = DictionaryObject({
        "/K": NumberObject(-1),
        "/Columns": NumberObject(17)
    })
    decoded = CCITTFaxDecode.decode(b"", decode_parms)
    assert decoded == expected

예제 #21

0

파일 보기

파일: pdf.py 프로젝트: mausvt/flectra

    def _create_attachment_object(self, attachment):
        ''' Create a PyPdf2.generic object representing an embedded file.

        Two objects are registered through ``self._addObject``: first the
        stream holding the file content, then the file specification that
        refers to it.

        :param attachment: A dictionary containing:
            * filename: The name of the file to embed (required)
            * content:  The bytes of the file to embed (required)
            * subtype: The mime-type of the file to embed (optional)
            * description: Text stored under /Desc (optional)
        :return: The value returned by ``self._addObject`` for the file
            specification dictionary.
        '''
        file_entry = DecodedStreamObject()
        file_entry.setData(attachment['content'])
        # /Params carries the MD5 hex digest, the current time and the byte
        # size of the content.
        file_entry.update({
            NameObject("/Type"):
            NameObject("/EmbeddedFile"),
            NameObject("/Params"):
            DictionaryObject({
                NameObject('/CheckSum'):
                createStringObject(md5(attachment['content']).hexdigest()),
                NameObject('/ModDate'):
                createStringObject(
                    datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT)),
                NameObject('/Size'):
                NameObject(str(len(attachment['content']))),
            }),
        })
        # /Subtype is only written when a non-empty subtype was supplied.
        if attachment.get('subtype'):
            file_entry.update({
                NameObject("/Subtype"):
                NameObject(attachment['subtype']),
            })
        file_entry_object = self._addObject(file_entry)
        filename_object = createStringObject(attachment['filename'])
        # Both /F and /UF of the file specification carry the same filename,
        # and both /F and /UF of its /EF entry point at the same stream.
        filespec_object = DictionaryObject({
            NameObject("/AFRelationship"):
            NameObject("/Data"),
            NameObject("/Type"):
            NameObject("/Filespec"),
            NameObject("/F"):
            filename_object,
            NameObject("/EF"):
            DictionaryObject({
                NameObject("/F"): file_entry_object,
                NameObject('/UF'): file_entry_object,
            }),
            NameObject("/UF"):
            filename_object,
        })
        if attachment.get('description'):
            filespec_object.update({
                NameObject("/Desc"):
                createStringObject(attachment['description'])
            })
        return self._addObject(filespec_object)

예제 #22

0

파일 보기

def add_comment(output, page, text, rectangle):
    """Attach a ``/FreeText`` annotation showing *text* to *page*.

    Args:
        output: Writer whose ``_addObject`` registers the new annotation.
        page: Page dictionary; its ``/Annots`` array is extended, or created
            when the page has none yet.
        text: Text stored under ``/Contents``.
        rectangle: Value passed to ``RectangleObject`` for ``/Rect``.
    """
    obj = output._addObject(
        DictionaryObject({
            NameObject('/DA'):
            TextStringObject(' /Helv 10 Tf'),
            NameObject('/Subtype'):
            NameObject('/FreeText'),
            NameObject('/Rect'):
            RectangleObject(rectangle),
            NameObject('/Type'):
            NameObject('/Annot'),
            NameObject('/Contents'):
            TextStringObject(text),
            NameObject('/C'):
            ArrayObject([FloatObject(1),
                         FloatObject(1),
                         FloatObject(1)]),
        }))
    # A page without annotations has no /Annots entry; indexing it directly
    # raised KeyError, so start a fresh array in that case.
    if '/Annots' in page:
        page['/Annots'].append(obj)
    else:
        page[NameObject('/Annots')] = ArrayObject([obj])

예제 #23

0

파일 보기

def test_DictionaryObject_read_from_stream_stream_stream_valid(
        strict, length, should_fail):
    """Read a dictionary followed by a stream whose ``/Length`` is *length*.

    The whole check runs inside ``pytest.raises``: when reading and the inner
    assertions succeed, a sentinel ``PdfReadError("__ALLGOOD__")`` is raised on
    purpose. The final assertion then requires that exactly one of
    ``should_fail`` and "the sentinel was the error raised" is true.

    The three arguments are presumably supplied by a parametrize decorator
    that is not part of this snippet.
    """
    stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" %
                     length)

    class Tst:  # to replace pdf
        strict = True

    pdf = Tst()
    pdf.strict = strict
    with pytest.raises(PdfReadError) as exc:
        do = DictionaryObject.read_from_stream(stream, pdf)
        # TODO: What should happen with the stream?
        assert do == {"/S": "/GoTo"}
        # Only for these two lengths is the stream content itself inspected.
        if length in (6, 10):
            assert b"BT /F1" in do._StreamObject__data
        # Sentinel: reaching this line means nothing above failed.
        raise PdfReadError("__ALLGOOD__")
    print(exc.value)
    # XOR: either a failure was expected, or the sentinel was raised.
    assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__")

예제 #24

0

파일 보기

파일: handlePDF.py 프로젝트: cadnant/oomap

def get_pdf_measure(m, gcs, poly, bounds_default):
    """
    Build and return the PDF Measure dictionary.

    The Measure dictionary is used in the viewport array
    and specifies the scale and units that apply to the output map.

    ``m`` and ``poly`` are forwarded to ``get_pdf_gpts``; ``gcs`` is stored
    under ``/GCS``. ``bounds_default`` is accepted but not used here.
    """
    measure = DictionaryObject()
    measure[NameObject('/Type')] = NameObject('/Measure')
    measure[NameObject('/Subtype')] = NameObject('/GEO')

    # The PDF's bounds array is equivalent to the map's neatline, i.e.,
    # the border delineating the extent of geographic data on the output map.
    unit_square = (0, 1, 0, 0, 1, 0, 1, 1)
    bounds = ArrayObject()
    for coordinate in unit_square:
        bounds.append(FloatObject(str(coordinate)))

    # /Bounds and /LPTS share the very same array object.
    remaining_entries = (
        ('/Bounds', bounds),
        ('/GPTS', get_pdf_gpts(m, poly)),
        ('/LPTS', bounds),
        ('/GCS', gcs),
    )
    for key, value in remaining_entries:
        measure[NameObject(key)] = value
    return measure

예제 #25

0

파일 보기

파일: _encryption.py 프로젝트: mstamy2/PyPDF2

    def __init__(
        self,
        algV: int,
        algR: int,
        entry: DictionaryObject,
        first_id_entry: bytes,
        StmF: str,
        StrF: str,
        EFF: str,
    ) -> None:
        """Store the encryption parameters; nothing is decrypted yet.

        ``key_size`` is taken from the ``/Length`` entry of *entry* and falls
        back to 40 when that entry is absent.
        """
        # See TABLE 3.18 Entries common to all encryption dictionaries
        self.algV, self.algR = algV, algR
        self.entry = entry
        self.key_size = entry.get("/Length", 40)
        self.id1_entry = first_id_entry

        # Methods for streams, strings and embedded files respectively.
        self.StmF, self.StrF, self.EFF = StmF, StrF, EFF

        # Password state until a password has been checked:
        # 1 => owner password
        # 2 => user password
        self._password_type = PasswordType.NOT_DECRYPTED
        self._key: Optional[bytes] = None

예제 #26

0

파일 보기

    def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname,
                                               fdata):
        '''Embed the ZUGFeRD XML file and refresh the PDF metadata.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        The root object of *pdf_filestream* receives new ``/AF``, ``/Metadata``
        and ``/Names`` entries, and the document info is updated through
        ``addMetadata``.

        :param pdf_filestream: Writer providing ``_addObject``,
            ``_root_object`` and ``addMetadata``.
        :param fname: Filename stored in the file specification.
        :param fdata: Content of the file to embed.
        '''
        # The entry for the file
        moddate = DictionaryObject()
        moddate.update({
            NameObject('/ModDate'):
            createStringObject(self._get_pdf_timestamp())
        })
        file_entry = DecodedStreamObject()
        file_entry.setData(fdata)
        file_entry.update({
            NameObject("/Type"):
            NameObject("/EmbeddedFile"),
            NameObject("/Params"):
            moddate,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"):
            NameObject("/text#2Fxml"),
        })
        file_entry_obj = pdf_filestream._addObject(file_entry)
        # The Filespec entry
        efEntry = DictionaryObject()
        efEntry.update({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
        })

        fname_obj = createStringObject(fname)
        filespec = DictionaryObject()
        filespec.update({
            NameObject("/AFRelationship"):
            NameObject("/Alternative"),
            NameObject("/Desc"):
            createStringObject("ZUGFeRD Invoice"),
            NameObject("/Type"):
            NameObject("/Filespec"),
            NameObject("/F"):
            fname_obj,
            NameObject("/EF"):
            efEntry,
            NameObject("/UF"):
            fname_obj,
        })
        # Register the file specification exactly once and reuse the returned
        # reference for both /Names and /AF. Registering it twice handed the
        # same dictionary to the writer as two separate objects.
        filespec_obj = pdf_filestream._addObject(filespec)

        embeddedFilesNamesDictionary = DictionaryObject()
        embeddedFilesNamesDictionary.update({
            NameObject("/Names"):
            ArrayObject([fname_obj, filespec_obj])
        })
        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embeddedFilesDictionary = DictionaryObject()
        embeddedFilesDictionary.update(
            {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})
        # Update the root
        metadata_xml_str = self._prepare_pdf_metadata()
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        metadata_value = pdf_filestream._addObject(metadata_file_entry)
        af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
        pdf_filestream._root_object.update({
            NameObject("/AF"):
            af_value,
            NameObject("/Metadata"):
            metadata_value,
            NameObject("/Names"):
            embeddedFilesDictionary,
        })
        info_dict = self._prepare_pdf_info()
        pdf_filestream.addMetadata(info_dict)

예제 #27

0

파일 보기

    def convert_to_pdfa(self):
        """
        Turn the currently opened PDF into a PDF/A compliant document.

        The header and trailer ID are rewritten, an sRGB output intent is registered, glyph widths are realigned with
        the embedded font files (only when fonttools is available), and finally the metadata is stamped and the
        writer is flagged as PDF/A.
        """
        # PDF/A-3 is built on top of PDF 1.7, hence the version announced in the header.
        # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
        #
        # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
        # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
        # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
        # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
        self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

        # A trailer ID is optional for plain PDF (unless encrypted) but mandatory for PDF/A. Its first element
        # identifies the content at creation time and the second the content at last update; for a freshly
        # created file both are the same digest.
        content_digest = md5(self._reader.stream.getvalue()).digest()
        document_id = ByteStringObject(content_digest)
        self._ID = ArrayObject((document_id, document_id))

        # Embed the sRGB ICC profile, already deflated, as the destination output profile.
        with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_file:
            compressed_icc = compress(icc_file.read())

        icc_stream = DecodedStreamObject()
        icc_stream.setData(compressed_icc)
        icc_stream.update({
            NameObject("/Filter"): NameObject("/FlateDecode"),
            NameObject("/N"): NumberObject(3),
            NameObject("/Length"): NameObject(str(len(compressed_icc))),
        })
        icc_reference = self._addObject(icc_stream)

        output_intent = DictionaryObject()
        output_intent.update({
            NameObject("/S"): NameObject("/GTS_PDFA1"),
            NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
            NameObject("/DestOutputProfile"): icc_reference,
            NameObject("/Type"): NameObject("/OutputIntent"),
        })
        output_intent_reference = self._addObject(output_intent)
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject([output_intent_reference]),
        })

        pages = self._root_object['/Pages']['/Kids']

        # PDF/A requires the embedded glyph width array to agree with the font file itself, which does not
        # seem to be the case for documents exported by wkhtmltopdf.
        if not TTFont:
            _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
        else:
            # Step 1: walk every page and collect each descendant font once, keyed by its object number.
            descendant_fonts = {}
            for page in pages:
                page_fonts = page.getObject()['/Resources']['/Font']
                for font_reference in page_fonts.values():
                    for descendant in font_reference.getObject()['/DescendantFonts']:
                        descendant_fonts[descendant.idnum] = descendant.getObject()

            # Step 2: recompute each width array straight from the font program, using
            # width = round(1000 * font_glyph_width / font_units_per_em)
            # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
            for font in descendant_fonts.values():
                font_program = font['/FontDescriptor']['/FontFile2']
                stream = io.BytesIO(decompress(font_program._data))
                ttfont = TTFont(stream)
                units_per_em = ttfont['head'].unitsPerEm
                metrics = ttfont.getGlyphSet()._hmtx.metrics
                widths = [
                    NumberObject(round(1000.0 * metric[0] / units_per_em))
                    for glyph_name, metric in metrics.items()
                    if glyph_name[:5] == 'glyph'
                ]

                font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(widths)])
                stream.close()

        outlines = self._root_object['/Outlines'].getObject()
        outlines[NameObject('/Count')] = NumberObject(1)

        # Set odoo as producer
        self.addMetadata({
            '/Creator': "Odoo",
            '/Producer': "Odoo",
        })
        self.is_pdfa = True

예제 #28

0

파일 보기

def test_DictionaryObject_read_from_stream_stream_no_newline():
    """A dictionary followed by ``stream`` without a newline raises PdfReadError."""
    data = BytesIO(b"<< /S /GoTo >>stream")
    with pytest.raises(PdfReadError) as excinfo:
        DictionaryObject.read_from_stream(data, None)
    message = excinfo.value.args[0]
    assert message == "Stream data must be followed by a newline"

예제 #29

0

파일 보기

    def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
        '''Attach the Factur-X XML to the PDF and refresh the document catalog.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        :param pdf_metadata: metadata forwarded to the XMP and
            document-information helpers.
        :param output_intents: iterable of
            ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
            pair is added to this writer and listed under /OutputIntents.
        '''

        # The entry for the file
        facturx_xml_str = self.factx.xml_str
        # The checksum must cover the embedded payload: md5() called without
        # data only digests the empty string, giving the same value for
        # every invoice.
        if isinstance(facturx_xml_str, str):
            # NOTE(review): assumes text payloads are UTF-8 - confirm
            checksum_input = facturx_xml_str.encode('utf-8')
        else:
            checksum_input = facturx_xml_str
        md5sum = hashlib.md5(checksum_input).hexdigest()
        md5sum_obj = createStringObject(md5sum)
        params_dict = DictionaryObject({
            NameObject('/CheckSum'): md5sum_obj,
            NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
            NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
            })
        file_entry = DecodedStreamObject()
        file_entry.setData(facturx_xml_str)  # here we integrate the file itself
        file_entry.update({
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Params"): params_dict,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"): NameObject("/text#2Fxml"),
            })
        file_entry_obj = self._addObject(file_entry)
        # The Filespec entry
        ef_dict = DictionaryObject({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
            })

        xmp_filename = self.factx.flavor.details['xmp_filename']
        fname_obj = createStringObject(xmp_filename)
        filespec_dict = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Data"),
            NameObject("/Desc"): createStringObject("Factur-X Invoice"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): fname_obj,
            NameObject("/EF"): ef_dict,
            NameObject("/UF"): fname_obj,
            })
        filespec_obj = self._addObject(filespec_dict)
        name_arrayobj_cdict = {fname_obj: filespec_obj}

        # TODO: add back additional attachments?
        logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
        # The /Names array lists (name, filespec) pairs ordered by name
        name_arrayobj_content_sort = sorted(
            name_arrayobj_cdict.items(), key=lambda x: x[0])
        logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
        name_arrayobj_content_final = []
        af_list = []
        # distinct loop names so the objects built above are not rebound
        for name_obj, spec_obj in name_arrayobj_content_sort:
            name_arrayobj_content_final += [name_obj, spec_obj]
            af_list.append(spec_obj)
        embedded_files_names_dict = DictionaryObject({
            NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
            })

        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embedded_files_dict = DictionaryObject({
            NameObject("/EmbeddedFiles"): embedded_files_names_dict,
            })
        res_output_intents = []
        logger.debug('output_intents=%s', output_intents)
        for output_intent_dict, dest_output_profile_dict in output_intents:
            dest_output_profile_obj = self._addObject(
                dest_output_profile_dict)
            # TODO detect if there are no other objects in output_intent_dest_obj
            # than /DestOutputProfile
            output_intent_dict.update({
                NameObject("/DestOutputProfile"): dest_output_profile_obj,
                })
            output_intent_obj = self._addObject(output_intent_dict)
            res_output_intents.append(output_intent_obj)

        # Update the root
        xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
        xmp_template = self.factx.flavor.get_xmp_xml()
        metadata_xml_str = _prepare_pdf_metadata_xml(xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        metadata_file_entry.update({
            NameObject('/Subtype'): NameObject('/XML'),
            NameObject('/Type'): NameObject('/Metadata'),
            })
        metadata_obj = self._addObject(metadata_file_entry)
        af_value_obj = self._addObject(ArrayObject(af_list))
        self._root_object.update({
            NameObject("/AF"): af_value_obj,
            NameObject("/Metadata"): metadata_obj,
            NameObject("/Names"): embedded_files_dict,
            # show attachments when opening PDF
            NameObject("/PageMode"): NameObject("/UseAttachments"),
            })
        logger.debug('res_output_intents=%s', res_output_intents)
        if res_output_intents:
            self._root_object.update({
                NameObject("/OutputIntents"): ArrayObject(res_output_intents),
            })
        metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
        self.addMetadata(metadata_txt_dict)

예제 #30

0

파일 보기

파일: pdf.py 프로젝트: olf42/python-drafthorse

def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
                                            output_intents):
    """Embed the Factur-X XML into ``pdf_filestream`` and update its catalog.

    :param pdf_filestream: PDF writer exposing ``_addObject``,
        ``_root_object`` and ``addMetadata``.
    :param facturx_xml_str: XML payload to embed; it is passed to
        ``hashlib.md5`` as-is, so it has to be bytes-like.
    :param pdf_metadata: metadata forwarded to the XMP and
        document-information helpers.
    :param facturx_level: level forwarded to the XMP metadata helper.
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each pair
        is added to the writer and listed under /OutputIntents.
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    # /ModDate is a PDF date string (D:YYYYMMDDHHmmSS), not ISO 8601.
    # No UTC offset is appended because now() returns naive local time.
    mod_date = datetime.datetime.now().strftime("D:%Y%m%d%H%M%S")
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)
    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}
    # The /Names array lists (name, filespec) pairs ordered by name
    name_arrayobj_content_sort = sorted(
        name_arrayobj_cdict.items(), key=lambda x: x[0])
    name_arrayobj_content_final = []
    af_list = []
    # distinct loop names so the objects built above are not rebound
    for name_obj, spec_obj in name_arrayobj_content_sort:
        name_arrayobj_content_final += [name_obj, spec_obj]
        af_list.append(spec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })
    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)
    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)

예제 #31

0

파일 보기

파일: downloader.py 프로젝트: Ernstsen/PearsonEbookDownloader

def main():
    """Assemble an e-book PDF from JSON dumps stored in the working directory.

    Reads ``bookinfo.json``, ``pageinfo.json`` and ``pages.json`` (and
    ``bookmarks.json`` when usable), writes every page to a temporary
    directory, merges the pages, adds bookmarks and page labels, and writes
    the result to ``<title>.pdf``.
    """

    print("Loading metadata and eText information...")

    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    with open("pages.json", 'r') as pagesFile:
        downloadedData = json.loads(pagesFile.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        """Return the decoded PDF bytes stored in pages.json for ``page_id``."""
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        if b is None:
            # fail with the offending id rather than an opaque TypeError
            raise KeyError("No data found in pages.json for pageID {!r}".format(page_id))
        # strip the data-URI prefix before decoding the base64 payload
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}

        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(pdfDownloadDir,
                                    "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))

        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly; the context
        # manager releases the worker threads once map() has returned
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True
        bookmarkInfo = None

        # TODO: Bookmarks currently not supported
        # A missing, unreadable or unexpectedly shaped bookmarks.json simply
        # means "no bookmarks" instead of aborting the whole run.
        try:
            with open("bookmarks.json", 'r') as bookmarkInfoRequest:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
            bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
        except (OSError, ValueError, LookupError, TypeError):
            bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)

                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (pageOrder, bookPageNumber) pairs sorted by numeric page order
        pdf_page_label_table = sorted(
            ((v, k) for k, v in pdfPageTable.items()),
            key=(lambda x: int(x[0])))
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            arabic_match = arabicRegex.match(pageLabel)
            roman_match = None if arabic_match else romanRegex.match(pageLabel)
            if arabic_match:
                curr_mode = "arabic"
                prefix = arabic_match.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif roman_match:
                curr_mode = "roman"
                prefix = roman_match.group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # a new label range starts whenever the numbering style or prefix changes
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)

예제 #32

0

파일 보기

def main(bookId):
    """Download an e-book page by page and assemble it into a single PDF.

    :param bookId: the numeric book id, or a URL (anything starting with
        ``http``) whose query string carries the id.

    Pages are fetched into a temporary directory, merged, decorated with
    bookmarks (when the service returns any) and page labels, and written
    to ``<bookId> - <title>.pdf``.
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" is a flat "key::value::key::value" string; pair it up
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}

        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']),
                                       savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly; the context
        # manager releases the worker threads once map() has returned
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir,
                                           pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)

        bookmarksExist = True

        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # Best effort: any failure to read or parse the response is
            # treated as "this book has no bookmarks".
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']
                              ['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)

                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark(
                "Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (pageOrder, bookPageNumber) pairs sorted by numeric page order
        pdfPageLabelTable = sorted(
            ((v, k) for k, v in pdfPageTable.items()),
            key=(lambda x: int(x[0])))
        labels = ArrayObject([
            NameObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            arabicMatch = arabicRegex.match(pageLabel)
            romanMatch = None if arabicMatch else romanRegex.match(pageLabel)
            if arabicMatch:
                currMode = "arabic"
                prefix = arabicMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanMatch:
                currMode = "roman"
                prefix = romanMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # a new label range starts whenever the numbering style or prefix changes
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)