コード例 #1
0
    def __get_font_settings(self, constant_option: str) -> list:
        """
        Collect the names of the fonts whose base font contains an option.

        :param constant_option: substring searched for in the lower-cased
            ``/BaseFont`` entry of every font in ``self.fonts``.
        :return: the matching keys of ``self.fonts``, each passed through
            ``utils.u_``, in iteration order.
        """
        base_font_key = utils.u_('/BaseFont')
        # Values are read through ``self.fonts[...]`` (not ``.items()``) so
        # any lookup logic of the container is still applied.
        return [
            utils.u_(name)
            for name in self.fonts
            if constant_option in self.fonts[name][base_font_key].lower()
        ]
コード例 #2
0
ファイル: main.py プロジェクト: zhangsui1997/PDF
def getTextByPage(self):
    """
    Return the text shown by this page's content stream.

    Only ``TextStringObject`` operands are kept: a ``ByteStringObject`` is a
    string whose byte->string encoding was unknown, so including it would
    produce gibberish.

    :return: the collected text, after one pass replacing each pair of
        consecutive newlines with a single newline.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    pieces = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            # Show a single string.
            if isinstance(operands[0], TextStringObject):
                pieces.append(operands[0])
        elif operator == b_("T*"):
            # Move to the next line.
            pieces.append("\n")
        elif operator == b_("'"):
            # Next line, then show the string.
            pieces.append("\n")
            if isinstance(operands[0], TextStringObject):
                pieces.append(operands[0])
        elif operator == b_('"'):
            # The string is the third operand of this operator.
            if isinstance(operands[2], TextStringObject):
                pieces.append("\n")
                pieces.append(operands[2])
        elif operator == b_("TJ"):
            # Array of strings and numbers; only the text strings are kept.
            pieces.extend(
                part for part in operands[0]
                if isinstance(part, TextStringObject))
            pieces.append("\n")
    return u_("").join(pieces).replace("\n\n", "\n")
コード例 #3
0
def convert_page_to_text(page):
    '''
    Extract the text of a page, adding a space after every shown string.

    This function was copied from the PyPDF2 extractText method.

    Only ``TextStringObject`` operands are used; ``ByteStringObject``
    operands have an unknown byte->string encoding and are skipped.
    '''
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_quote = b_("'")
    op_double_quote = b_('"')
    op_show_array = b_("TJ")

    chunks = []
    for operands, operator in stream.operations:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                chunks += [shown, ' ']
        elif operator == op_next_line:
            chunks.append("\n")
        elif operator == op_quote:
            # The newline is emitted even when the operand is not text.
            chunks.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                chunks += [shown, ' ']
        elif operator == op_double_quote:
            # Here the newline is emitted only together with a text string.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                chunks += ["\n", shown, ' ']
        elif operator == op_show_array:
            for element in operands[0]:
                if isinstance(element, TextStringObject):
                    chunks += [element, ' ']
            chunks.append("\n")

    return u_("").join(chunks)
コード例 #4
0
ファイル: tables.py プロジェクト: akapitonov/PyPDF2
    def process_content_object(self, objects):
        """
        Walk a content stream and append its text to ``self.table``.

        For every operation ``self.get_id(operands)`` is consulted first;
        when it yields an id, that id becomes the current one and the
        operator itself is not interpreted. Otherwise font selections are
        remembered and text operators produce text, which is appended to
        ``self.table[<current id>]`` once an id is known. Finally
        ``self.strip_table_spaces()`` is called.

        :param objects: passed to ``ContentStream`` together with
            ``self.finder``.
        """
        from PyPDF2.pdf import ContentStream
        stream = ContentStream(objects, self.finder)
        if stream is None:
            return

        op_font = b_("Tf")
        op_show = (b_("Tj"), b_("TJ"))
        op_next_line = b_("T*")
        op_quote = b_("'")
        op_double_quote = b_('"')

        active_id = None
        active_font = None
        for operands, operator in stream.operations:
            chunk = u_("")
            found_id = self.get_id(operands)
            if found_id is not None:
                active_id = found_id
            elif operator == op_font:
                active_font = operands[0]
            elif operator in op_show:
                chunk += self.converter.process_text_objects(
                    operands, active_font)
            elif operator == op_next_line:
                chunk += "\n"
            elif operator == op_quote:
                chunk += "\n"
                if isinstance(operands[0], TextStringObject):
                    chunk += operands[0]
            elif operator == op_double_quote:
                # The string is the third operand of this operator.
                if isinstance(operands[2], TextStringObject):
                    chunk += "\n"
                    chunk += operands[2]

            # Text seen before any id was found is dropped.
            if chunk and active_id is not None:
                self.table[active_id] += chunk

        self.strip_table_spaces()
コード例 #5
0
ファイル: tables.py プロジェクト: akapitonov/PyPDF2
 def process_text_objects(self, operands, current_font):
     """
     Convert the operands of a text-showing operator to a string.

     :param operands: either a single object or a list; when the list's
         first element is itself a list, that inner list is the one
         iterated.
     :param current_font: forwarded to ``process_text_object`` for every
         item.
     :return: the concatenation of the per-item results; an empty string
         when ``operands`` is falsy.
     """
     empty = u_("")
     if not operands:
         return empty
     if not isinstance(operands, list):
         return empty + self.process_text_object(operands, current_font)

     first = operands[0]
     sequence = first if isinstance(first, list) else operands
     return empty.join(
         self.process_text_object(entry, current_font)
         for entry in sequence)
コード例 #6
0
ファイル: tables.py プロジェクト: akapitonov/PyPDF2
    def search(self):
        """
        Scan every page of ``self.pdf`` and feed its text to ``self.tables``.

        For each page the fonts are registered through
        ``self.converter.process_fonts`` and the content stream is walked.
        The operands of the most recent ``re`` operator, the current font
        and the last coordinates taken from ``Td`` / ``cm`` are tracked;
        whenever an operation yields text, it is handed to
        ``self.tables.process`` together with that state.

        :return: the result of ``self.tables.get_tables()``.
        """
        from PyPDF2.pdf import ContentStream

        op_rect = b_("re")
        op_font = b_("Tf")
        op_show = (b_("Tj"), b_("TJ"))
        op_next_line = b_("T*")
        op_quote = b_("'")
        op_double_quote = b_('"')
        op_move = b_("Td")
        op_matrix = b_("cm")

        for page_index in range(self.pdf.getNumPages()):
            page = self.pdf.getPage(page_index)
            self.converter.process_fonts(page_index, page)

            stream = page["/Contents"].getObject()
            if not isinstance(stream, ContentStream):
                stream = ContentStream(stream, page)

            # Per-page state; ``rectangle`` holds the operands of the last
            # ``re`` operator seen on this page.
            font = None
            pos_x = None
            pos_y = None
            rectangle = None

            for operands, operator in stream.operations:
                chunk = u_("")
                if operator == op_rect:
                    rectangle = operands
                elif operator == op_font:
                    font = operands[0]
                elif operator in op_show:
                    chunk += self.converter.process_text_objects(
                        operands, font)
                elif operator == op_next_line:
                    chunk += "\n"
                elif operator == op_quote:
                    chunk += "\n"
                    if isinstance(operands[0], TextStringObject):
                        chunk += operands[0]
                elif operator == op_double_quote:
                    if isinstance(operands[2], TextStringObject):
                        chunk += "\n"
                        chunk += operands[2]
                elif operator == op_move:
                    # Both operands are taken as the text coordinates.
                    pos_x, pos_y = operands
                elif operator == op_matrix:
                    # Only the last two operands are used as coordinates.
                    *_, pos_x, pos_y = operands

                if chunk:
                    self.tables.process(rectangle, chunk, pos_x, pos_y)

        CashObject().clean()
        return self.tables.get_tables()
コード例 #7
0
def extractText_PageObject(self):
    """
    Extract the text of this page in content-stream order.

    Every text drawing command is visited in the order it appears in the
    content stream. This works well for some PDF files and poorly for
    others, depending on the generator used; do not rely on the order of
    the returned text.

    Only ``TextStringObject`` operands are used. ``ByteStringObject``
    operands are strings whose byte->string encoding was unknown, so adding
    them would be gibberish.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_quote = b_("'")
    op_double_quote = b_('"')
    op_show_array = b_("TJ")

    result = u_("")
    for operands, operator in stream.operations:
        # Text contributed by this single operation.
        fragment = ""
        if operator == op_show:
            if isinstance(operands[0], TextStringObject):
                fragment = operands[0]
        elif operator == op_next_line:
            fragment = "\n"
        elif operator == op_quote:
            fragment = "\n"
            if isinstance(operands[0], TextStringObject):
                fragment += operands[0]
        elif operator == op_double_quote:
            if isinstance(operands[2], TextStringObject):
                fragment = "\n" + operands[2]
        elif operator == op_show_array:
            strings = [
                element for element in operands[0]
                if isinstance(element, TextStringObject)
            ]
            fragment = "".join(strings) + "\n"
        result += fragment
    return result
コード例 #8
0
ファイル: tables.py プロジェクト: akapitonov/PyPDF2
    def process_text_object(self, item, current_font):
        """
        Convert one string operand to text.

        A ``TextStringObject`` is returned unchanged unless
        ``self.text_to_hex`` is set, in which case its original bytes are
        decoded through the font table. A ``ByteStringObject`` is always
        decoded through the font table. Anything else yields an empty
        string.

        :param item: the operand to convert.
        :param current_font: key into ``self.fonts`` selecting the table
            used for decoding.
        :return: the resulting text.
        """
        def decode(raw):
            # Hex-encode the bytes, cut the hex string into codes as long as
            # the first key of the font table (1 when there is no table) and
            # translate each code, using '?' for unknown ones.
            hex_string = raw.hex()
            glyph_map = self.fonts.get(current_font, {})
            if glyph_map:
                code_length = len(list(glyph_map.keys())[0])
            else:
                code_length = 1
            glyphs = []
            for start in range(0, len(hex_string), code_length):
                code = hex_string[start:start + code_length]
                glyphs.append(glyph_map.get(code, '?'))
            return ''.join(glyphs)

        if isinstance(item, TextStringObject):
            if not self.text_to_hex:
                return item
            return decode(item.get_original_bytes())
        if isinstance(item, ByteStringObject):
            return decode(item)
        return u_("")
コード例 #9
0
 def __extract(self, page: list) -> str:
     """
     Extract text from a sequence of content-stream operations.

     The text drawing commands are handled in the order they are given.
     This works well for some PDF files and poorly for others, depending on
     the generator used; do not rely on the order of the returned text.
     The operators (Tj, TJ etc) are described on page 196 of the PDF
     Reference.

     Only ``TextStringObject`` operands are used. ``ByteStringObject``
     operands are strings whose byte->string encoding was unknown, so
     adding them would be gibberish.

     :param page: iterable of ``(operands, operator)`` pairs.
     :return: a unicode string object.
     """
     as_bytes = utils.b_
     op_show = as_bytes("Tj")
     op_next_line = as_bytes("T*")
     op_quote = as_bytes("'")
     op_double_quote = as_bytes('"')
     op_show_array = as_bytes("TJ")

     fragments = []
     for operands, operator in page:
         if operator == op_show:
             candidate = operands[0]
             if isinstance(candidate, TextStringObject):
                 fragments.append(candidate)
         elif operator == op_next_line:
             fragments.append("\n")
         elif operator == op_quote:
             fragments.append("\n")
             candidate = operands[0]
             if isinstance(candidate, TextStringObject):
                 fragments.append(candidate)
         elif operator == op_double_quote:
             # The string is the third operand of this operator.
             candidate = operands[2]
             if isinstance(candidate, TextStringObject):
                 fragments.extend(("\n", candidate))
         elif operator == op_show_array:
             fragments.extend(
                 element for element in operands[0]
                 if isinstance(element, TextStringObject))
             fragments.append("\n")
     return utils.u_("").join(fragments)
コード例 #10
0
 def customExtractText(self):
     """
     Extract the page text as a single whitespace-normalised line.

     Only ``TextStringObject`` operands are used; ``ByteStringObject``
     operands have an unknown byte->string encoding and are skipped.
     Inside a ``TJ`` array a number below -100 is turned into a space, and
     a ``TD`` or ``Tm`` operator adds a space unless the text is empty or
     already ends with a space or newline.

     :return: the text with every " - " replaced by "-" and every run of
         whitespace collapsed to one space.
     """
     stream = self["/Contents"].getObject()
     if not isinstance(stream, ContentStream):
         stream = ContentStream(stream, self.pdf)

     op_show = b_("Tj")
     op_next_line = b_("T*")
     op_quote = b_("'")
     op_double_quote = b_('"')
     op_show_array = b_("TJ")
     op_positioning = (b_("TD"), b_("Tm"))

     collected = u_("")
     for operands, operator in stream.operations:
         if operator == op_show:
             if isinstance(operands[0], TextStringObject):
                 collected += operands[0]
         elif operator == op_next_line:
             collected += "\n"
         elif operator == op_quote:
             collected += "\n"
             if isinstance(operands[0], TextStringObject):
                 collected += operands[0]
         elif operator == op_double_quote:
             if isinstance(operands[2], TextStringObject):
                 collected += "\n"
                 collected += operands[2]
         elif operator == op_show_array:
             for element in operands[0]:
                 if isinstance(element, TextStringObject):
                     collected += element
                 elif isinstance(element, (FloatObject, NumberObject)):
                     # A number below -100 is treated as a word gap.
                     if element < -100:
                         collected += " "
         elif operator in op_positioning:
             if collected and collected[-1] not in (" ", "\n"):
                 collected += " "
     return re.sub("\\s+", " ", collected.replace(" - ", "-"))
コード例 #11
0
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """
    Extract the page text, prefixing every ``Tj`` string with a separator.

    Only ``TextStringObject`` operands are used. ``ByteStringObject``
    operands are strings whose byte->string encoding was unknown, so adding
    them would be gibberish.

    :param Tj_sep: text inserted before each string shown by ``Tj``.
    :return: a unicode string object.
    """
    page_stream = self["/Contents"].getObject()
    if not isinstance(page_stream, ContentStream):
        page_stream = ContentStream(page_stream, self.pdf)

    output = u_("")
    for operands, operator in page_stream.operations:
        if operator == b_("T*"):
            output += "\n"
            continue

        if operator == b_("TJ"):
            for element in operands[0]:
                if isinstance(element, TextStringObject):
                    output += element
            output += "\n"
            continue

        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                output += Tj_sep
                output += shown
            continue

        if operator == b_("'"):
            # The newline is added whether or not the operand is text.
            output += "\n"
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                output += shown
            continue

        if operator == b_('"'):
            # The string is the third operand of this operator.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                output += "\n"
                output += shown
    return output
コード例 #12
0
    def alt_extractText(self):
        """
        Extract the page text, marking every non-text operation.

        Text operators contribute their ``TextStringObject`` operands and
        line breaks; every other operation contributes one marker
        character, so the positions of non-text operations stay visible in
        the result.

        :return: a unicode string object.
        """
        after_tj = ""
        other_op_marker = "~"

        stream = self["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, self.pdf)

        op_show = b_("Tj")
        op_next_line = b_("T*")
        op_quote = b_("'")
        op_double_quote = b_('"')
        op_show_array = b_("TJ")

        pieces = []
        for operands, operator in stream.operations:
            if operator == op_show:
                if isinstance(operands[0], TextStringObject):
                    pieces.append(operands[0])
                    pieces.append(after_tj)
            elif operator == op_next_line:
                pieces.append("\n")
            elif operator == op_quote:
                pieces.append("\n")
                if isinstance(operands[0], TextStringObject):
                    pieces.append(operands[0])
            elif operator == op_double_quote:
                if isinstance(operands[2], TextStringObject):
                    pieces.append("\n")
                    pieces.append(operands[2])
            elif operator == op_show_array:
                pieces.extend(
                    element for element in operands[0]
                    if isinstance(element, TextStringObject))
                pieces.append("\n")
            else:
                pieces.append(other_op_marker)
        return u_("").join(pieces)
コード例 #13
0
        def extractText(self, skip_intertwined_text=True):
            """
            Locate all text drawing commands, in the order they are provided in the
            content stream, and extract the text.  This works well for some PDF
            files, but poorly for others, depending on the generator used.  This will
            be refined in the future.  Do not rely on the order of text coming out of
            this function, as it will change if this function is made more
            sophisticated.

            Besides the text itself, spacing is approximated from positioning
            data: numbers inside ``TJ`` arrays and the operands of ``Td`` /
            ``Tm`` are scaled by fixed divisors (100, 20 and 50) into counts of
            spaces and newlines.

            :param skip_intertwined_text: when true, the ``TJ`` that follows a
                same-line ``Td`` whose scaled offset is smaller than the width
                already written is skipped.
            :return: a unicode string object.
            """
            text = u_("")
            content = self["/Contents"].getObject()
            if not isinstance(content, ContentStream):
                content = ContentStream(content, self.pdf)
            # Note: we check all strings are TextStringObjects.  ByteStringObjects
            # are strings where the byte->string encoding was unknown, so adding
            # them to the text here would be gibberish.
            indent = 0  # accumulated Td x-offset, or the Tm x-translation
            previous_width = 0  # characters/spaces written by TJ on this line
            skip_next = False  # set when the next TJ should be dropped
            for operands, operator in content.operations:
                if not operands:  # Empty operands list contributes no text
                    operands = [""]
                if operator == b_("Tj"):
                    _text = operands[0]
                    if isinstance(_text, TextStringObject):
                        text += _text
                elif operator == b_("T*"):
                    text += "\n"
                elif operator == b_("'"):
                    text += "\n"
                    _text = operands[0]
                    if isinstance(_text, TextStringObject):
                        text += operands[0]
                elif operator == b_('"'):
                    _text = operands[2]
                    if isinstance(_text, TextStringObject):
                        text += "\n"
                        text += _text
                elif operator == b_("TJ"):
                    if skip_intertwined_text and skip_next:
                        skip_next = False
                    else:
                        for i in operands[0]:
                            if isinstance(i, TextStringObject):
                                text += i
                                previous_width += len(i)
                            elif isinstance(i, (FloatObject, NumberObject)):
                                # A numeric adjustment becomes int(i / -100)
                                # spaces, unless the text already ends with
                                # whitespace.
                                if text and text[-1] not in " \n":
                                    text += " " * int(i / -100)
                                    previous_width += int(i / -100)
                elif operator == b_("Td"):
                    indent = indent + operands[0]
                    if operands[1] == 0:
                        # Same line: pad up to the scaled offset, or flag the
                        # following TJ as overlapping what was written.
                        if int(operands[0] / 20) >= previous_width:
                            text += " " * (int(operands[0] / 20) - previous_width)
                        else:
                            skip_next = True
                            # If skip_intertwined_text is false, this will result in no space between the two 'lines'
                    else:
                        previous_width = 0
                        text += "\n" * max(0, int(operands[1] / -50)) + " " * max(0, int(indent / 20))
                elif operator == b_("Tm"):
                    indent = operands[4]
                    text += " " * max(0, int(indent / 20))
                elif operator == b_("TD"):
                    # The original condition also tested for "Tm" here, but
                    # "Tm" is always consumed by the branch above, so that
                    # test could never succeed and has been removed.
                    if text and text[-1] not in " \n":
                        text += " "
            return text