Exemple #1
0
def extract_text_objects(page):
    """Yield the text pieces of a PageObject in content-stream order.

    The page's ``/Contents`` object is wrapped in a ContentStream when it is
    not one already, and its operations are walked once:

    * ``Tj``: yields the operand if it is a TextStringObject.
    * ``T*``: yields ``"\\n"``.
    * ``'``: yields ``"\\n"``, then the operand if it is a TextStringObject.
    * ``"``: yields ``"\\n"`` and the third operand, but only if that
      operand is a TextStringObject.
    * ``TJ``: yields every TextStringObject element of the array operand,
      then ``"\\n"``.

    Note that the order may change as the PyPDF2 package evolves.

    Adapted directly from the extractText method of the PageObject class
    from PyPDF2.pdf.

    :param page: page object indexable by ``"/Contents"`` and exposing ``pdf``.
    :return: a generator of TextStringObject instances and ``"\\n"`` strings.
    """
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, page.pdf)
    # The walk runs whether or not the object had to be wrapped above; when it
    # was nested under the ``if`` a ready-made ContentStream yielded nothing.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_("T*"):
            yield "\n"
        elif operator == b_("'"):
            yield "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                yield "\n"
                yield _text
        elif operator == b_("TJ"):
            for x in operands[0]:
                if isinstance(x, TextStringObject):
                    yield x
            yield "\n"
def convert_page_to_text(page):
    '''
    Return the text of ``page`` as a single unicode string.

    Copied from the PyPDF2 extractText method; a single space is appended
    after every extracted string.

    :param page: page object providing ``getContents()`` and ``pdf``.
    :return: the concatenated text.
    '''
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    show = b_("Tj")
    next_line = b_("T*")
    quote = b_("'")
    double_quote = b_('"')
    show_array = b_("TJ")

    # Only TextStringObjects are kept: ByteStringObjects are strings whose
    # byte->string encoding was unknown, so they would come out as gibberish.
    chunks = []
    for args, op in stream.operations:
        if op == show:
            shown = args[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown + ' ')
        elif op == next_line:
            chunks.append("\n")
        elif op == quote:
            chunks.append("\n")
            shown = args[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown + ' ')
        elif op == double_quote:
            shown = args[2]
            if isinstance(shown, TextStringObject):
                chunks.append("\n")
                chunks.append(shown + ' ')
        elif op == show_array:
            chunks.extend(
                item + ' ' for item in args[0]
                if isinstance(item, TextStringObject))
            chunks.append("\n")

    return u_("").join(chunks)
Exemple #3
0
def getTextByPage(self):
    """Return the page text with each doubled newline collapsed to one.

    Walks the operations of the page's ``/Contents`` stream, collects the
    TextStringObject operands of the text-showing operators plus the
    newlines implied by ``T*``, ``'``, ``"`` and ``TJ``, and finally applies
    ``replace("\\n\\n", "\\n")`` to the joined result.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    def fragments():
        # ByteStringObjects (unknown byte->string encoding) are skipped on
        # purpose; adding them would produce gibberish.
        for args, op in stream.operations:
            if op == b_("Tj"):
                if isinstance(args[0], TextStringObject):
                    yield args[0]
            elif op == b_("T*"):
                yield "\n"
            elif op == b_("'"):
                yield "\n"
                if isinstance(args[0], TextStringObject):
                    yield args[0]
            elif op == b_('"'):
                if isinstance(args[2], TextStringObject):
                    yield "\n"
                    yield args[2]
            elif op == b_("TJ"):
                for item in args[0]:
                    if isinstance(item, TextStringObject):
                        yield item
                yield "\n"

    joined = u_("").join(fragments())
    return joined.replace("\n\n", "\n")
def extract_text_objects(page):
    """Yield the text pieces of a PageObject in content-stream order.

    The page's ``/Contents`` object is wrapped in a ContentStream when it is
    not one already, and its operations are walked once:

    * ``Tj``: yields the operand if it is a TextStringObject.
    * ``T*``: yields ``"\\n"``.
    * ``'``: yields ``"\\n"``, then the operand if it is a TextStringObject.
    * ``"``: yields ``"\\n"`` and the third operand, but only if that
      operand is a TextStringObject.
    * ``TJ``: yields ``blockify(operands[0])`` (``blockify`` is defined
      outside this function), then ``"\\n"``.

    Note that the order may change as the PyPDF2 package evolves.

    Adapted directly from the extractText method of the PageObject class
    from PyPDF2.pdf.

    :param page: page object indexable by ``"/Contents"`` and exposing ``pdf``.
    :return: a generator of text items and ``"\\n"`` strings.
    """
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, page.pdf)
    # The walk runs whether or not the object had to be wrapped above; when it
    # was nested under the ``if`` a ready-made ContentStream yielded nothing.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_("T*"):
            yield "\n"
        elif operator == b_("'"):
            yield "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                yield "\n"
                yield _text
        elif operator == b_("TJ"):
            # The whole array goes through blockify as one item.
            yield blockify(operands[0])
            yield "\n"
Exemple #5
0
 def _evaluateColor3(self, operator, operands, color):
     """Tell whether a three-operand ``sc``/``rg`` operation sets ``color``.

     The three operands are multiplied by 255 and truncated with ``int``
     before being compared with ``color.red``, ``color.green`` and
     ``color.blue``.

     :return: True on an exact match of all three components, else False.
     """
     if operator != b_("sc") and operator != b_("rg"):
         return False
     r, g, b = (int(operands[i] * 255) for i in range(3))
     return bool(r == color.red and g == color.green and b == color.blue)
Exemple #6
0
 def process_content(self):
     """Blank matching text operations on every page and queue the pages.

     For each page of ``self.reader`` the ``/Contents`` object is parsed into
     a ContentStream. Every ``TJ``/``Tj`` operation whose first operand
     satisfies ``any_match(..., self.remove_list)`` is printed and replaced
     by an empty TextStringObject. The edited stream is stored back on the
     page, which is then added to ``self.writer``.
     """
     reader = self.reader
     text_ops = (b_("TJ"), b_("Tj"))
     for page_num in range(reader.getNumPages()):
         page = reader.getPage(page_num)
         stream = ContentStream(page["/Contents"].getObject(), reader)
         for args, op in stream.operations:
             if op not in text_ops:
                 continue
             shown = args[0]
             if any_match(shown, self.remove_list):
                 print(shown)
                 args[0] = TextStringObject('')
         page[NameObject('/Contents')] = stream
         self.writer.addPage(page)
Exemple #7
0
 def _evaluateColor4(self, operator, operands, color):
     """Tell whether a four-operand ``sc``/``k`` operation sets ``color``.

     The four operands are handed to ``cmykToRGB`` as cyan, magenta, yellow
     and black; each returned component is multiplied by 255 and truncated
     with ``int`` before the comparison with ``color.red``, ``color.green``
     and ``color.blue``.

     :return: True on an exact match of all three components, else False.
     """
     if operator != b_("sc") and operator != b_("k"):
         return False
     c, m, y, k = operands[0], operands[1], operands[2], operands[3]
     (red, green, blue) = cmykToRGB(c, m, y, k)
     r, g, b = int(red * 255), int(green * 255), int(blue * 255)
     return bool(r == color.red and g == color.green and b == color.blue)
Exemple #8
0
    def search(self):
        """Scan every page's content stream and feed its text to ``self.tables``.

        For each page the fonts are registered through
        ``self.converter.process_fonts``. Every operation that produces a
        non-empty text fragment is then passed to ``self.tables.process``
        together with the operands of the most recent ``re`` operator and
        the most recent coordinates taken from ``Td`` / ``cm`` operators.

        :return: the result of ``self.tables.get_tables()``, obtained after
            all pages are processed and ``CashObject().clean()`` has run.
        """
        from PyPDF2.pdf import ContentStream

        pdf = self.pdf
        for page_no in range(pdf.getNumPages()):
            page = pdf.getPage(page_no)
            self.converter.process_fonts(page_no, page)

            stream = page["/Contents"].getObject()
            if not isinstance(stream, ContentStream):
                stream = ContentStream(stream, page)

            font = None
            x = y = None
            rect = None  # operands of the latest "re" (rectangle) operator

            for args, op in stream.operations:
                fragment = u_("")
                if op == b_("re"):
                    rect = args
                elif op == b_("Tf"):
                    font = args[0]
                elif op in (b_("Tj"), b_("TJ")):
                    fragment += self.converter.process_text_objects(args, font)
                elif op == b_("T*"):
                    fragment += "\n"
                elif op == b_("'"):
                    fragment += "\n"
                    if isinstance(args[0], TextStringObject):
                        fragment += args[0]
                elif op == b_('"'):
                    if isinstance(args[2], TextStringObject):
                        fragment += "\n"
                        fragment += args[2]
                elif op == b_("Td"):
                    # both operands are kept as the text coordinates
                    x, y = args
                elif op == b_("cm"):
                    # only the last two operands are kept as coordinates
                    *_, x, y = args

                if fragment:
                    self.tables.process(rect, fragment, x, y)

        CashObject().clean()
        return self.tables.get_tables()
Exemple #9
0
 def _swapColorCmd(self, content, index, toColor):
     """Replace the operation at ``index`` with an ``rg`` command for ``toColor``.

     The red, green and blue attributes of ``toColor`` are divided by 255.0
     and wrapped in FloatObjects to form the operands of the new operation.

     :param content: object whose ``operations`` list is edited in place.
     :param index: position of the operation to replace.
     :param toColor: object exposing ``red``, ``green`` and ``blue``.
     """
     components = [
         FloatObject((level / 255.0))
         for level in (toColor.red, toColor.green, toColor.blue)
     ]
     replacement = (components, b_("rg"))
     ops = content.operations
     # pop + insert is kept on purpose: it is not the same as item
     # assignment when ``index`` is negative.
     ops.pop(index)
     ops.insert(index, replacement)
Exemple #10
0
    def build_pdf_preview(
        self,
        file_path: str,
        preview_name: str,
        cache_path: str,
        extension: str = ".pdf",
        page_id: int = -1,
        mimetype: str = "",
    ) -> None:
        """Build a PDF preview of ``file_path`` inside ``cache_path``.

        An intermediate PDF of the whole document, named after the part of
        ``preview_name`` before ``"-page"``, is created with
        ``self._convert_to_pdf`` unless it already exists. When ``page_id``
        is negative that intermediate file is the result. Otherwise the
        requested page is written to ``preview_name + extension``.

        :param file_path: path of the source document.
        :param preview_name: base name of the preview file.
        :param cache_path: directory holding intermediate and final files.
        :param extension: suffix of the single-page output file.
        :param page_id: page to extract; negative means the whole document.
        :param mimetype: forwarded to ``self._convert_to_pdf``.
        """
        full_pdf_name = preview_name.split("-page")[0] + ".pdf"
        full_pdf_path = os.path.join(cache_path, full_pdf_name)

        if not os.path.exists(full_pdf_path):
            if os.path.exists(full_pdf_path + "_flag"):
                # Info - B.L - 2018/09/28 - Protection against concurrent
                # file access: if two people preview the same file, one would
                # overwrite it while the other is reading it. So wait two
                # seconds and start over.
                time.sleep(2)
                return self.build_pdf_preview(
                    file_path=file_path,
                    preview_name=preview_name,
                    cache_path=cache_path,
                    extension=extension,
                    page_id=page_id,
                    mimetype=mimetype,
                )

            # First step: convert the full document to a full PDF.
            with open(file_path, "rb") as source:
                self._convert_to_pdf(
                    file_content=source,
                    input_extension=os.path.splitext(file_path)[1],
                    cache_path=cache_path,
                    output_filepath=full_pdf_path,
                    mimetype=mimetype,
                )

        if page_id < 0:
            # The intermediate file is the requested one.
            return

        writer = PdfFileWriter()
        with open(full_pdf_path, "rb") as pdf_file:
            # HACK - G.M - 2020-08-19 - Transform the stream in a way pypdf2
            # can handle; to be removed with a future pdf builder.
            buffered = BytesIO(b_(pdf_file.read()))
            reader = utils.get_decrypted_pdf(buffered)
            writer.addPage(reader.getPage(page_id))

        target_path = os.path.join(
            cache_path, "{}{}".format(preview_name, extension))
        with open(target_path, "wb") as target:
            writer.write(target)
Exemple #11
0
def extractText_PageObject(self):
    """
    Extract the text drawn by the page's content stream.

    Text-showing commands are visited in the order the content stream
    provides them. This works well for some PDF files and poorly for
    others, depending on the generator used. Do not rely on the order of
    the returned text; it will change if this function is made more
    sophisticated.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so they would be gibberish.
    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                pieces += ["\n", args[2]]
        elif op == b_("TJ"):
            pieces.extend(
                item for item in args[0]
                if isinstance(item, TextStringObject))
            pieces.append("\n")
    return u_("").join(pieces)
Exemple #12
0
    def extract_text_blocks(self) -> list:
        """
        Collect the operations enclosed by each BT (begin text) / ET (end text) pair.

        The ``/Contents`` object of ``self.pageObject`` is wrapped in a
        ContentStream when needed. The position of an open BT is tracked
        with ``None`` as the "no block open" marker, so a BT that is the
        very first operation (index 0) is recognised. An ET with no open BT
        is ignored.

        :return: a list with one entry per text block; each entry is the
            slice of ``content.operations`` strictly between BT and ET.
        """
        blocks = []
        content = self.pageObject["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pageObject.pdf)
        begin_text = utils.b_("BT")
        end_text = utils.b_("ET")
        start = None  # index of the currently open BT, None outside a block
        for index, (_operands, operator) in enumerate(content.operations):
            if operator == begin_text:
                start = index
            elif operator == end_text and start is not None:
                blocks.append(content.operations[start + 1:index])
                start = None

        return blocks
Exemple #13
0
    def process_content_object(self, objects):
        """Append the text of a content stream to ``self.table``.

        ``objects`` is parsed into a ContentStream using ``self.finder``.
        While walking the operations, the most recent non-None result of
        ``self.get_id(operands)`` selects the table entry, and the most
        recent ``Tf`` operand is remembered as the font passed to
        ``self.converter.process_text_objects``. Text produced by an
        operation is appended to ``self.table[<current id>]``. Finally
        ``self.strip_table_spaces()`` is called.
        """
        from PyPDF2.pdf import ContentStream
        stream = ContentStream(objects, self.finder)
        if stream is None:
            return

        current_id = None
        font = None

        for args, op in stream.operations:
            chunk = u_("")
            found_id = self.get_id(args)
            if found_id is not None:
                # An operation carrying an id is not inspected any further.
                current_id = found_id
            elif op == b_("Tf"):
                font = args[0]
            elif op in (b_("Tj"), b_("TJ")):
                chunk += self.converter.process_text_objects(args, font)
            elif op == b_("T*"):
                chunk += "\n"
            elif op == b_("'"):
                chunk += "\n"
                if isinstance(args[0], TextStringObject):
                    chunk += args[0]
            elif op == b_('"'):
                if isinstance(args[2], TextStringObject):
                    chunk += "\n"
                    chunk += args[2]

            if current_id is not None and chunk:
                self.table[current_id] += chunk

        self.strip_table_spaces()
Exemple #14
0
def _xobj_to_image(x_object_obj):
    """
    Convert an image XObject into a file extension and the image bytes.

    Users need to have the pillow package installed.

    It's unclear if PyPDF2 will keep this function here, hence it's private.
    It might get removed at any point.

    The result depends on the object's filter entry:

    * FlateDecode, or no filter at all: the data is loaded with Pillow and
      re-encoded as PNG (with the soft mask as alpha channel in the
      FlateDecode case, when one is present).
    * DCTDecode, "/JPXDecode": the data is returned unchanged with the
      ".jpg" / ".jp2" extension.
    * Any filter matching none of the branches: extension is None and the
      data is returned unchanged.

    :param x_object_obj: the image XObject; must provide ``getData()`` and
        the width, height and color-space entries.
    :return: Tuple[file extension, bytes]
    """
    import io

    from PIL import Image

    from PyPDF2.constants import GraphicsStateParameters as G

    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
    data = x_object_obj.getData()
    # Every color space other than DeviceRGB is loaded in Pillow's "P" mode.
    if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB:
        mode = "RGB"
    else:
        mode = "P"
    # Stays None when a filter is present but matches no branch below.
    extension = None
    if SA.FILTER in x_object_obj:
        if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
            extension = ".png"
            img = Image.frombytes(mode, size, data)
            if G.S_MASK in x_object_obj:  # add alpha channel
                alpha = Image.frombytes("L", size,
                                        x_object_obj[G.S_MASK].getData())
                img.putalpha(alpha)
            img_byte_arr = io.BytesIO()
            img.save(img_byte_arr, format="PNG")
            data = img_byte_arr.getvalue()
        # NOTE(review): the filter value is compared against one-element
        # lists here, not against the bare filter names - confirm intended.
        elif x_object_obj[SA.FILTER] in ([FT.LZW_DECODE], [FT.ASCII_85_DECODE],
                                         [FT.CCITT_FAX_DECODE]):
            from PyPDF2.utils import b_
            extension = ".png"
            data = b_(data)
        elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
            extension = ".jpg"
        elif x_object_obj[SA.FILTER] == "/JPXDecode":
            extension = ".jp2"
        # Reached only when the filter is the bare CCITTFaxDecode name; the
        # one-element-list form is caught by the branch further up.
        elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
            extension = ".tiff"
    else:
        # No filter entry: the data is loaded directly and PNG-encoded.
        extension = ".png"
        img = Image.frombytes(mode, size, data)
        img_byte_arr = io.BytesIO()
        img.save(img_byte_arr, format="PNG")
        data = img_byte_arr.getvalue()

    return extension, data
Exemple #15
0
 def remove_text_from_normal_page(self, pg, pdf):
     """Blank the text operations of ``pg`` that start with ``self.wmtext``.

     Both ``Tj`` (single string) and ``TJ`` (array) operations are handled:

     * ``Tj``: a TextStringObject operand starting with ``self.wmtext`` is
       replaced by an empty TextStringObject.
     * ``TJ``: the TextStringObject elements of the array are joined; when
       the joined text starts with ``self.wmtext`` the array is emptied in
       place, so the operand stays an array.

     The edited stream is stored back under ``/Contents``. When nothing
     matched and ``self.form`` is truthy, the page is passed on to
     ``self.remove_form_from_normal_page``.

     :param pg: the page to clean.
     :param pdf: the reader handed to ContentStream.
     :return: the cleaned page.
     """
     content_object = pg["/Contents"].getObject()
     content = ContentStream(content_object, pdf)
     flag = False
     text_ops = (b_('TJ'), b_('Tj'))
     for operands, operator in content.operations:
         if operator not in text_ops:
             continue
         shown = operands[0]
         # isinstance (not ``type(...) is list``) so list subclasses used
         # for parsed arrays are recognised as well.
         if isinstance(shown, list):
             # The joined text is a plain str, so it is tested directly
             # rather than through an isinstance(TextStringObject) check
             # that could never succeed.
             text = ''.join(
                 piece for piece in shown
                 if isinstance(piece, TextStringObject))
             if text.startswith(self.wmtext):
                 del shown[:]
                 flag = True
         elif isinstance(shown, TextStringObject) and shown.startswith(
                 self.wmtext):
             operands[0] = TextStringObject('')
             flag = True
     pg[NameObject('/Contents')] = content
     if not flag and self.form:
         pg = self.remove_form_from_normal_page(pg)
     return pg
Exemple #16
0
 def __extract(self, page: list) -> str:
     """
     Extract the text drawn by a sequence of content-stream operations.

     Text-showing commands are visited in the order given. This works well
     for some PDF files and poorly for others, depending on the generator
     used. Do not rely on the order of the returned text; it will change if
     this function is made more sophisticated.

     Operators (Tj, TJ etc) can be found in page 196 of PDF Reference.

     :param page: iterable of ``(operands, operator)`` pairs.
     :return: a unicode string object.
     """
     # Only TextStringObjects are collected. ByteStringObjects are strings
     # whose byte->string encoding was unknown, so they would be gibberish.
     collected = []
     for args, op in page:
         if op == utils.b_("Tj"):
             candidate = args[0]
             if isinstance(candidate, TextStringObject):
                 collected.append(candidate)
         elif op == utils.b_("T*"):
             collected.append("\n")
         elif op == utils.b_("'"):
             collected.append("\n")
             candidate = args[0]
             if isinstance(candidate, TextStringObject):
                 collected.append(candidate)
         elif op == utils.b_('"'):
             candidate = args[2]
             if isinstance(candidate, TextStringObject):
                 collected.extend(("\n", candidate))
         elif op == utils.b_("TJ"):
             collected.extend(
                 item for item in args[0]
                 if isinstance(item, TextStringObject))
             collected.append("\n")
     return utils.u_("").join(collected)
Exemple #17
0
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """
    Extract the page text, inserting ``Tj_sep`` before every ``Tj`` string.

    The other text operators (``T*``, ``'``, ``"``, ``TJ``) contribute
    their text and newlines without the separator.

    :param Tj_sep: separator placed in front of each ``Tj`` string.
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so they would be gibberish.
    out = []
    emit = out.append
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject):
                emit(Tj_sep)
                emit(shown)
        elif op == b_("T*"):
            emit("\n")
        elif op == b_("'"):
            emit("\n")
            shown = args[0]
            if isinstance(shown, TextStringObject):
                emit(shown)
        elif op == b_('"'):
            shown = args[2]
            if isinstance(shown, TextStringObject):
                emit("\n")
                emit(shown)
        elif op == b_("TJ"):
            for item in args[0]:
                if isinstance(item, TextStringObject):
                    emit(item)
            emit("\n")
    return u_("").join(out)
 def removeWatermark(self):
     """Inspect page index 399 of ``self.pdfObj``; nothing is removed yet.

     Prints the document info and the latin-1 encoded text of the page,
     walks the ``TJ`` operations of its content stream without changing
     them, then stores the parsed stream back under ``/Contents``.
     """
     pdf = self.pdfObj
     print(pdf.getDocumentInfo())
     # NOTE: only page index 399 is visited, not the whole page range.
     for page_no in (399,):
         page = pdf.getPage(page_no)
         print(page.extractText().encode('latin-1'))
         stream = ContentStream(page["/Contents"].getObject(), pdf)
         for args, op in stream.operations:
             if op != b_("TJ"):
                 continue
             first = args[0][0]
             if isinstance(first, TextStringObject):
                 # placeholder: the matching text is not modified
                 pass
         page[NameObject('/Contents')] = stream
         print("\n\n")
Exemple #19
0
    def swapColor(self, pageIndex, fromColor, toColor):
        """
        Replace every colour-setting operation matching fromColor with toColor.

        Operations whose operator is listed in ``self.operators`` are
        evaluated according to their operand count (1, 3 or 4 operands).
        Matching ones are rewritten through ``self._swapColorCmd``. The
        edited stream is stored back on the page and compressed.

        :param pageIndex: index of evaluated page
        :param fromColor: color which will be substituted
        :param toColor: destination color
        :return:
        """
        if pageIndex >= self.getNumPages():
            print("That page doesn't exist")
            return

        print("Evaluating page no. %d..." % pageIndex)
        page = self.getPage(pageIndex)
        stream = page["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, page.pdf)

        replaced = 0
        for position, (operands, operator) in enumerate(stream.operations):
            if operator == b_("cs"):
                self.printDebug("Nonstroking color space.")
                continue
            if operator not in self.operators:
                continue

            arity = len(operands)
            if arity == 3:
                matched = self._evaluateColor3(operator, operands, fromColor)
            elif arity == 1:
                matched = self._evaluateColor1(operator, operands, fromColor)
            elif arity == 4:
                matched = self._evaluateColor4(operator, operands, fromColor)
            else:
                matched = False
            if not matched:
                continue

            replaced += 1
            # When _removeCSRef reports success, the command one position
            # earlier is the one rewritten.
            if self._removeCSRef(stream, position):
                self._swapColorCmd(stream, position - 1, toColor)
            else:
                self._swapColorCmd(stream, position, toColor)

        page[NameObject("/Contents")] = stream
        page.compressContentStreams()
        print("Replaced %d references of given color.\n" % replaced)
Exemple #20
0
def get_pdf_content(path):
    """Return the ``Tj`` text strings found on the first page of a PDF.

    The file is opened in a ``with`` block that spans the whole parse, so
    the handle is closed on return and on error. Anything ``capture``
    reports on stderr while the reader is built is logged as a warning
    (without its last character).

    :param path: path of the PDF file to read.
    :return: list of the TextStringObject operands of the page's ``Tj``
        operations, in content-stream order.
    """
    # The contents will end up in this list
    lines = []

    with open(path, "rb") as pdf_file:
        # Load PDF into pyPDF
        pdf, _stdout, stderr = capture(PdfFileReader, pdf_file)
        if len(stderr) > 0:
            logger.warning(stderr[:-1])

        # This voodoo code is taken from the extractText method of the
        # PageObject. It stays inside the ``with`` block because it still
        # works on the reader built from the open file.
        content = pdf.getPage(0)["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf)

        for operands, operator in content.operations:
            if operator == b_("Tj"):
                text = operands[0]
                if isinstance(text, TextStringObject):
                    lines.append(text)
    return lines
Exemple #21
0
    def __filter_blocks(self, blocks: list, options: list,
                        included: bool) -> list:
        """
        Keep the text of the blocks whose font passes the ``options`` filter.

        Only the first ``Tf`` operation of each block is consulted; a block
        without any ``Tf`` operation is dropped.

        :param blocks: the list of text with its formatting options
        :param options: the options to compare against
        :param included: truthy to keep blocks whose ``Tf`` operand is in
            ``options``, falsy to keep those whose operand is not.
        :return: list of the extracted text of every kept block.
        """
        selected = []
        for block in blocks:
            for operands, operator in block:
                if operator != utils.b_("Tf"):
                    continue
                listed = operands[0] in options
                keep = listed if included else not listed
                break
            else:
                keep = False
            if keep:
                selected.append(self.__extract(block))

        return selected
Exemple #22
0
def generate_facturx_from_file(
        pdf_invoice, facturx_xml, facturx_level='autodetect',
        check_xsd=True, pdf_metadata=None, output_pdf_file=None,
        additional_attachments=None):
    """
    Generate a Factur-X invoice from a regular PDF invoice and a factur-X XML
    file. The method uses a file as input (regular PDF invoice) and re-writes
    the file (Factur-X PDF invoice).
    :param pdf_invoice: the regular PDF invoice as file path
    (type string) or as file object
    :type pdf_invoice: string or file
    :param facturx_xml: the Factur-X XML
    :type facturx_xml: string, file or etree object
    :param facturx_level: the level of the Factur-X XML file. Default value
    is 'autodetect'. The only advantage to specify a particular value instead
    of using the autodetection is for a very very small perf improvement.
    Possible values: minimum, basicwl, basic, en16931.
    :type facturx_level: string
    :param check_xsd: if enable, checks the Factur-X XML file against the XSD
    (XML Schema Definition). If this step has already been performed
    beforehand, you should disable this feature to avoid a double check
    and get a small performance improvement.
    :type check_xsd: boolean
    :param pdf_metadata: Specify the metadata of the generated Factur-X PDF.
    If pdf_metadata is None (default value), this lib will generate some
    metadata in English by extracting relevant info from the Factur-X XML.
    Here is an example for the pdf_metadata argument:
    pdf_metadata = {
        'author': 'Akretion',
        'keywords': 'Factur-X, Invoice',
        'title': 'Akretion: Invoice I1242',
        'subject':
          'Factur-X invoice I1242 dated 2017-08-17 issued by Akretion',
        }
    If you pass the pdf_metadata argument, you will not use the automatic
    generation based on the extraction of the Factur-X XML file, which will
    bring a very small perf improvement. Values of the dict that are not
    strings are replaced by an empty string (the dict is updated in place).
    :type pdf_metadata: dict
    :param output_pdf_file: File Path to the output Factur-X PDF file
    :type output_pdf_file: string or unicode
    :param additional_attachments: Specify the other files that you want to
    embed in the PDF file. It is a dict where keys are filepath and value
    is the description of the file (as unicode or string).
    :type additional_attachments: dict
    :return: Returns True. This method re-writes the input PDF invoice file,
    unless the output_pdf_file is provided.
    :rtype: bool
    :raises ValueError: when an argument is missing or has the wrong type
    :raises TypeError: when facturx_xml is neither a string, an etree element
    nor a file
    """
    start_chrono = datetime.now()
    logger.debug('1st arg pdf_invoice type=%s', type(pdf_invoice))
    logger.debug('2nd arg facturx_xml type=%s', type(facturx_xml))
    logger.debug('optional arg facturx_level=%s', facturx_level)
    logger.debug('optional arg check_xsd=%s', check_xsd)
    logger.debug('optional arg pdf_metadata=%s', pdf_metadata)
    logger.debug(
        'optional arg additional_attachments=%s', additional_attachments)

    # --- argument validation ---
    if not pdf_invoice:
        raise ValueError('Missing pdf_invoice argument')
    if not facturx_xml:
        raise ValueError('Missing facturx_xml argument')
    if not isinstance(facturx_level, (str, unicode)):
        raise ValueError('Wrong facturx_level argument')
    if not isinstance(check_xsd, bool):
        raise ValueError('check_xsd argument must be a boolean')
    if not isinstance(pdf_metadata, (type(None), dict)):
        raise ValueError('pdf_metadata argument must be a dict or None')
    if not isinstance(additional_attachments, (dict, type(None))):
        raise ValueError(
            'additional_attachments argument must be a dict or None')
    if not isinstance(output_pdf_file, (type(None), str, unicode)):
        raise ValueError('output_pdf_file argument must be a string or None')

    # A string is treated as a path; anything else as a file object.
    if isinstance(pdf_invoice, (str, unicode)):
        file_type = 'path'
    else:
        file_type = 'file'

    # --- normalise the XML input to a byte string (and, if given, a tree) ---
    xml_root = None
    if isinstance(facturx_xml, str):
        xml_string = facturx_xml
    elif isinstance(facturx_xml, unicode):
        xml_string = facturx_xml.encode('utf8')
    elif isinstance(facturx_xml, type(etree.Element('pouet'))):
        xml_root = facturx_xml
        xml_string = etree.tostring(
            xml_root, pretty_print=True, encoding='UTF-8',
            xml_declaration=True)
    elif isinstance(facturx_xml, file):
        facturx_xml.seek(0)
        xml_string = facturx_xml.read()
        facturx_xml.close()
    else:
        raise TypeError(
            "The second argument of the method generate_facturx must be "
            "either a string, an etree.Element() object or a file "
            "(it is a %s)." % type(facturx_xml))

    # --- read the additional attachments ---
    # The resulting dict is keyed by the file *content*; the value carries the
    # file name, description and modification date.
    additional_attachments_read = {}
    if additional_attachments:
        for attach_filepath, attach_desc in additional_attachments.items():
            filename = os.path.basename(attach_filepath)
            mod_timestamp = os.path.getmtime(attach_filepath)
            mod_dt = datetime.fromtimestamp(mod_timestamp)
            # Binary mode: attachments may be arbitrary files and must be
            # embedded unaltered (text mode can corrupt them on Windows).
            with open(attach_filepath, 'rb') as fa:
                additional_attachments_read[fa.read()] = {
                    'filename': filename,
                    'desc': attach_desc,
                    'mod_date': mod_dt,
                    }

    # --- metadata: derive from the XML, or sanitise what the caller gave ---
    if pdf_metadata is None:
        if xml_root is None:
            xml_root = etree.fromstring(xml_string)
        base_info = _extract_base_info(xml_root)
        pdf_metadata = _base_info2pdf_metadata(base_info)
    else:
        # clean-up pdf_metadata dict
        # (only values of existing keys are replaced, so iterating while
        # assigning is safe)
        for key, value in pdf_metadata.iteritems():
            if not isinstance(value, (str, unicode)):
                pdf_metadata[key] = ''

    # --- Factur-X level: autodetect when the given value is not a known one ---
    facturx_level = facturx_level.lower()
    if facturx_level not in FACTURX_LEVEL2xsd:
        if xml_root is None:
            xml_root = etree.fromstring(xml_string)
        logger.debug('Factur-X level will be autodetected')
        facturx_level = get_facturx_level(xml_root)
    if check_xsd:
        check_facturx_xsd(
            xml_string, flavor='factur-x', facturx_level=facturx_level)

    # --- build the new PDF from the pages of the original one ---
    original_pdf = PdfFileReader(pdf_invoice)
    # Extract /OutputIntents obj from original invoice
    output_intents = _get_original_output_intents(original_pdf)
    new_pdf_filestream = PdfFileWriter()
    new_pdf_filestream._header = b_("%PDF-1.6")
    new_pdf_filestream.appendPagesFromReader(original_pdf)

    # Carry over the document /ID of the original PDF when it has one.
    original_pdf_id = original_pdf.trailer.get('/ID')
    logger.debug('original_pdf_id=%s', original_pdf_id)
    if original_pdf_id:
        new_pdf_filestream._ID = original_pdf_id
        # else : generate some ?
    _facturx_update_metadata_add_attachment(
        new_pdf_filestream, xml_string, pdf_metadata, facturx_level,
        output_intents=output_intents,
        additional_attachments=additional_attachments_read)

    # --- write the result: explicit output path, else over the input ---
    if output_pdf_file:
        with open(output_pdf_file, 'wb') as output_f:
            new_pdf_filestream.write(output_f)
    else:
        if file_type == 'path':
            with open(pdf_invoice, 'wb') as f:
                new_pdf_filestream.write(f)
        elif file_type == 'file':
            new_pdf_filestream.write(pdf_invoice)
    logger.info('%s file added to PDF invoice', FACTURX_FILENAME)
    end_chrono = datetime.now()
    logger.info(
        'Factur-X invoice generated in %s seconds',
        (end_chrono - start_chrono).total_seconds())
    return True
Exemple #23
0
    def removeWordStyle(self, ignoreByteStringObject=False):
        """
        Removes imported styles from Word - Path Constructors rectangles - from this output.

        For every page, rgb/cmyk color operations that target text are set to
        black and those that target rectangles are set to white, and 're'
        rectangles whose size falls inside the highlight range are dropped.

        :param bool ignoreByteStringObject: optional parameter
            to ignore ByteString Objects: text operands that are not
            TextStringObject instances are replaced by an empty
            TextStringObject.
        """

        for page in self.getObject(self._pages)['/Kids']:
            pageRef = self.getObject(page)
            stream = pageRef['/Contents'].getObject()

            if not isinstance(stream, ContentStream):
                stream = ContentStream(stream, pageRef)

            kept_operations = []
            font_size = 0

            for position, (operands, operator) in enumerate(
                    stream.operations):

                # Remember the size of the most recent '/F...' font selection.
                if operator == b_('Tf') and operands[0][:2] == '/F':
                    font_size = operands[1].as_numeric()

                # Index of the string operand for the single-string text
                # operators; None for every other operator.
                if operator in (b_('Tj'), b_("'")):
                    string_slot = 0
                elif operator == b_('"'):
                    string_slot = 2
                else:
                    string_slot = None

                if string_slot is not None:
                    shown = operands[string_slot]
                    if ignoreByteStringObject and not isinstance(
                            shown, TextStringObject):
                        operands[string_slot] = TextStringObject()
                elif operator == b_("TJ"):
                    pieces = operands[0]
                    for piece_index, piece in enumerate(pieces):
                        if ignoreByteStringObject and not isinstance(
                                piece, TextStringObject):
                            pieces[piece_index] = TextStringObject()

                operator_type = self._getOperatorType(operator)

                # Grayscale colors are deliberately left untouched: tests
                # showed that black underlines, borders and tables are defined
                # in grayscale and do not use rgb/cmyk colors.
                if operator_type in ('rgb', 'cmyk'):
                    target = self._getColorTargetOperationType(
                        position, stream.operations)

                    # Text becomes black and rectangles become white; removing
                    # the colors altogether would paint rectangles black.
                    if target == 'text':
                        operands = self.colors_operands[operator_type][
                            'black']
                    elif target == 'rectangle':
                        operands = self.colors_operands[operator_type][
                            'white']

                # Remove styled rectangles (highlights, lines, etc.).
                # 're' is a path construction operator that creates a
                # rectangle; presumably that is how Word embeds its graphics
                # when it produces a PDF.
                if operator == b_('re'):
                    rect_width = operands[-2].as_numeric()
                    rect_height = operands[-1].as_numeric()

                    # (length of X letters at the current size)
                    min_width = self.getMinimumRectangleWidth(font_size, 1)
                    # range to catch really big highlights
                    max_height = font_size + 6
                    # so that thin lines will not be removed
                    min_height = 1.5

                    # Drop only rectangles wider than the minimum whose height
                    # lies above the minimum and at or below the maximum.
                    is_wide = rect_width > min_width
                    if (is_wide and rect_height > min_height
                            and rect_height <= max_height):
                        continue

                kept_operations.append((operands, operator))

            stream.operations = kept_operations
            pageRef[NameObject('/Contents')] = stream
Exemple #24
0
    def _getOperatorType(self, operator):
        """
        Classify a content-stream operator.

        :param operator: the operator as found in ``ContentStream.operations``
        :return: 'text', 'rgb', 'cmyk', 'grayscale', 'rectangle' or 'line',
            or None when the operator is not one of the known ones.
        """
        categories = (
            ('text', ('Tj', "'", '"', "TJ")),
            ('rgb', ('rg', 'RG')),  # color
            ('cmyk', ('k', 'K')),  # color
            ('grayscale', ('g', 'G')),  # color
            ('rectangle', ('re',)),
            # 'l': line, 'm': start line, 'S': stroke (paint) line
            ('line', ('l', 'm', 'S')),
        )
        lookup = {
            b_(name): category
            for category, names in categories
            for name in names
        }

        return lookup.get(operator)
Exemple #25
0
    return np.any(diff.max(axis=1) < epsilon)


# Judging from the results, some of the instructions of a watermark added by
# MS-Word are interleaved with regular data, so finer-grained tuning is needed.
with open(with_wm_path, 'rb') as f, open(nowm_out_path, 'wb') as f_out:
    pdf = PdfFileReader(f)
    pdf_out = PdfFileWriter()
    cn_pages = pdf.getNumPages()
    for i in range(cn_pages):
        page = pdf.getPage(i)
        content = page.getContents()
        cs = ContentStream(content, pdf)
        for operands, operator in cs.operations:
            # An operation has its operands emptied when it is:
            #   - a 'Tm' / 'cm' matrix matching a watermark text / image
            #     location,
            #   - a 'gs' selecting the '/GS0' graphics state, or
            #   - a 'Do' painting the '/Fm0' object ('/Im0' is left alone).
            # (`b_` is only the Python 2/3 bytes-conversion shim.)
            if (
                (operator == b_('Tm')
                 and match_location(operands, TARGET_TXT))
                or (operator == b_('cm')
                    and match_location(operands, TARGET_IMG))
                or (operator == b_('gs') and operands[0] == '/GS0')
                or (operator == b_('Do') and operands[0] == '/Fm0')
            ):
                del operands[:]
        page[NameObject('/Contents')] = cs
        pdf_out.addPage(page)
    pdf_out.write(f_out)
Exemple #26
0
        def extractText(self, skip_intertwined_text=True):
            """
            Walk the text drawing commands of the content stream in the order
            they appear and assemble the text they show, approximating
            horizontal gaps with spaces and vertical moves with newlines.
            The result depends heavily on the generator of the PDF; do not
            rely on the order of the text, which may change if this function
            becomes more sophisticated.

            :param skip_intertwined_text: when true, the next ``TJ`` operation
                is skipped after a ``Td`` with a zero vertical offset whose
                scaled horizontal offset is smaller than the width written so
                far on the line.
            :return: a unicode string object.
            """
            text = u_("")
            stream = self["/Contents"].getObject()
            if not isinstance(stream, ContentStream):
                stream = ContentStream(stream, self.pdf)
            # Only TextStringObjects are appended.  ByteStringObjects are
            # strings whose byte->string encoding was unknown, so adding them
            # would produce gibberish.
            indent = 0
            line_width = 0
            skip_next = False
            for operands, operator in stream.operations:
                if not operands:  # Empty operands list contributes no text
                    operands = [""]
                if operator == b_("Tj"):
                    shown = operands[0]
                    if isinstance(shown, TextStringObject):
                        text += shown
                elif operator == b_("T*"):
                    text += "\n"
                elif operator == b_("'"):
                    text += "\n"
                    shown = operands[0]
                    if isinstance(shown, TextStringObject):
                        text += shown
                elif operator == b_('"'):
                    shown = operands[2]
                    if isinstance(shown, TextStringObject):
                        text += "\n"
                        text += shown
                elif operator == b_("TJ"):
                    if not (skip_intertwined_text and skip_next):
                        for item in operands[0]:
                            if isinstance(item, TextStringObject):
                                text += item
                                line_width += len(item)
                            elif isinstance(item, (FloatObject, NumberObject)):
                                # Numeric entries become spaces, unless the
                                # text already ends with whitespace.
                                if text and text[-1] not in " \n":
                                    gap = int(item / -100)
                                    text += " " * gap
                                    line_width += gap
                    else:
                        skip_next = False
                elif operator == b_("Td"):
                    indent = indent + operands[0]
                    if operands[1] == 0:
                        # Horizontal move on the same line.
                        shift = int(operands[0] / 20)
                        if shift >= line_width:
                            text += " " * (shift - line_width)
                        else:
                            skip_next = True
                            # If skip_intertwined_text is false, this will
                            # result in no space between the two 'lines'
                    else:
                        line_width = 0
                        text += ("\n" * max(0, int(operands[1] / -50))
                                 + " " * max(0, int(indent / 20)))
                elif operator == b_("Tm"):
                    indent = operands[4]
                    text += " " * max(0, int(indent / 20))
                elif operator == b_("TD"):
                    # "Tm" never reaches this branch: it is handled above.
                    if text and text[-1] not in " \n":
                        text += " "
            return text
Exemple #27
0
 def _evaluateColor1(self, operator, operands, color):
     """Tell whether a one-operand "sc"/"g" operation sets ``color``.

     The single operand is scaled by 255 and truncated to an int; the result
     must equal the red, green and blue attributes of ``color``.
     Returns False for any other operator.
     """
     if operator not in (b_("sc"), b_("g")):
         return False
     grey = int(operands[0] * 255)
     return bool(
         grey == color.red and grey == color.green and grey == color.blue)
Exemple #28
0
"""

# Copyright (c) 2020 Ben Zimmer. All rights reserved.

import sys
from typing import List, Tuple, Optional

import PyPDF2 as pdf
from PyPDF2.utils import b_
from PyPDF2.pdf import ContentStream, PageObject

DEBUG = False

# Text-positioning operators.
TEXT_POSITION_OPS = [  # page 310
    b_(x) for x in ["Td", "TD", "Tm", "T*"]
]

# Text-showing operators: Tj, ', " and TJ.
# (The list previously named "TJ" twice and left out "Tj".)
TEXT_SHOW_OPS = [  # page 311
    b_(x) for x in ["Tj", "'", "\"", "TJ"]
]

SPACING_EXPECTED = {-14.445, -14.446}
START_EXPECTED = {
    528.045,  # normal pages
    # 400.95        # chapter title pages
    402.143  # chapter title pages
}

FONT_WEIGHT_MAPPING = {"/F34": "regular", "/F32": "bold"}
Exemple #29
0
class PdfColorConverter(PdfFileWriter):
    """PdfFileWriter that can replace one fill color by another on a page."""

    # Color-setting operators that swapColor inspects.
    operators = [b_("sc"), b_("rg"), b_("g"), b_("k")]

    def __init__(self, debug=False):
        """
        :param debug: when true, printDebug messages are printed.
        """
        PdfFileWriter.__init__(self)
        self.debug = debug

    def printDebug(self, str):
        """Print ``str`` only when debugging is enabled."""
        if self.debug:
            print(str)
        return

    def swapColor(self, pageIndex, fromColor, toColor):
        """
        Substitutes all the color switching operators with fromColor with toColor.
        :param pageIndex: index of evaluated page
        :param fromColor: color which will be substituted (object with
            ``red``, ``green`` and ``blue`` attributes)
        :param toColor: destination color (same kind of object)
        :return: None; the page content is replaced and compressed in place
        """
        if pageIndex >= self.getNumPages():
            print("That page doesn't exist")
            return

        print("Evaluating page no. %d..." % pageIndex)
        page = self.getPage(pageIndex)
        content = page["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page.pdf)

        # The evaluator is chosen by the number of operands of the operation.
        evaluators = {
            1: self._evaluateColor1,
            3: self._evaluateColor3,
            4: self._evaluateColor4,
        }
        swap_counter = 0
        # The operation list shrinks whenever a preceding "cs" entry is
        # removed, so it is walked with an explicit index that is corrected
        # after each removal; iterating it directly would skip the operation
        # that follows every such swap.
        index = 0
        while index < len(content.operations):
            operands, operator = content.operations[index]

            if operator == b_("cs"):
                self.printDebug("Nonstroking color space.")
            elif operator in self.operators:
                evaluate = evaluators.get(len(operands))
                if evaluate is not None and evaluate(operator, operands,
                                                     fromColor):
                    swap_counter += 1
                    if self._removeCSRef(content, index):
                        # The color operation moved down by one slot.
                        index -= 1
                    self._swapColorCmd(content, index, toColor)
            index += 1

        key = NameObject("/Contents")
        page[key] = content
        page.compressContentStreams()
        print("Replaced %d references of given color.\n" % swap_counter)

    # removing operator that switches color space
    def _removeCSRef(self, content, index):
        """
        Remove the operation right before ``index`` when it is a "cs".
        :return: True when an operation was removed (every later operation
            then sits one slot lower), False otherwise
        """
        if index >= 1 and content.operations[index - 1][1] == b_("cs"):
            content.operations.pop(index - 1)
            self.printDebug("Removing CS")
            return True
        return False

    def _swapColorCmd(self, content, index, toColor):
        """
        Replace the operation at ``index`` by an "rg" operation that sets
        ``toColor`` (components scaled from 0-255 to 0-1).
        """
        redObj = FloatObject((toColor.red / 255.0))
        greenObj = FloatObject((toColor.green / 255.0))
        blueObj = FloatObject((toColor.blue / 255.0))
        operator = b_("rg")
        content.operations[index] = ([redObj, greenObj, blueObj], operator)
        return

    def _evaluateColor3(self, operator, operands, color):
        """
        Tell whether a three-operand "sc"/"rg" operation sets ``color``.
        Operands are scaled by 255 and truncated before the comparison.
        """
        if operator == b_("sc") or operator == b_("rg"):
            red = int(operands[0] * 255)
            green = int(operands[1] * 255)
            blue = int(operands[2] * 255)
            if red == color.red and green == color.green and blue == color.blue:
                return True
        return False

    def _evaluateColor1(self, operator, operands, color):
        """
        Tell whether a one-operand "sc"/"g" operation sets ``color``, i.e.
        whether the scaled grey level equals all three components of it.
        """
        if operator == b_("sc") or operator == b_("g"):
            grey = int(operands[0] * 255)
            if grey == color.red and grey == color.green and grey == color.blue:
                return True
        return False

    def _evaluateColor4(self, operator, operands, color):
        """
        Tell whether a four-operand "sc"/"k" operation sets ``color``.
        The CMYK operands are converted with cmykToRGB, then scaled by 255
        and truncated before the comparison.
        """
        if operator == b_("sc") or operator == b_("k"):
            cyan = operands[0]
            magenta = operands[1]
            yellow = operands[2]
            black = operands[3]
            (red, green, blue) = cmykToRGB(cyan, magenta, yellow, black)
            red = int(red * 255)
            green = int(green * 255)
            blue = int(blue * 255)
            if red == color.red and green == color.green and blue == color.blue:
                return True
        return False
Exemple #30
0
 def _removeCSRef(self, content, index):
     """Drop the operation just before ``index`` if its operator is "cs".

     Returns True when an operation was removed, False otherwise (including
     when ``index`` is smaller than 1).
     """
     if index < 1:
         return False
     if not content.operations[index - 1][1] == b_("cs"):
         return False
     content.operations.pop(index - 1)
     self.printDebug("Removing CS")
     return True
# Empty the operands of every 'cm' operation located at a watermark position,
# then write all pages to tmp/output.pdf.
# NOTE(review): `file` is presumably the handle `source` was opened from --
# confirm; it is closed here even when processing fails.
try:
    for p in range(source.getNumPages()):
        page = source.getPage(p)
        # print(page.extractText())
        #content_object, = page["/Contents"][0].getObject()
        # Only the second entry of the page's /Contents is processed.
        content_object = page["/Contents"][1]

        content = ContentStream(content_object, source)
        for operands, operator in content.operations:
            # print(operator, operands) # type and value of each PDF element

            # The core of the script: use whatever works to find a feature
            # that identifies the watermark.
            # if operator == b_("TJ"): # `b_` is only the Python 2/3 bytes-conversion shim
            #     text = operands[0][0]
            #     # if isinstance(text, bytes):
            #     #     print('====  ', text, '  ====')
            #     #     for c in guess_codes(text):
            #     #         print(c, text.decode(c))
            #     if isinstance(text, TextStringObject) and text in target_str:
            #         operands[0] = TextStringObject('')

            if operator == b_("cm") and match_location(operands, target_locations):
                operands[:] = []

        page.__setitem__(NameObject('/Contents'), content)
        output.addPage(page)

    # The context manager closes the output file even if write() raises.
    with open("tmp/output.pdf", "wb") as outputStream:
        output.write(outputStream)
finally:
    file.close()