Beispiel #1
0
    def process_content_object(self, objects):
        """Walk a content stream and append decoded text to ``self.table``.

        The stream is parsed with ``self.finder`` as the owning document.
        Every operation is first offered to ``self.get_id``; a non-None
        result becomes the current table key.  Text produced by the
        text-showing operators is appended to ``self.table`` under the most
        recently seen key.  ``self.strip_table_spaces()`` is called once at
        the end.

        :param objects: the object handed to ``ContentStream`` for parsing.
        """
        from PyPDF2.pdf import ContentStream

        stream = ContentStream(objects, self.finder)
        if stream is None:
            return

        active_id = None
        active_font = None

        for args, op in stream.operations:
            chunk = u_("")
            found_id = self.get_id(args)
            if found_id is not None:
                # An id-carrying operation only updates the current key.
                active_id = found_id
            elif op == b_("Tf"):
                active_font = args[0]
            elif op in (b_("Tj"), b_("TJ")):
                chunk += self.converter.process_text_objects(
                    args, active_font)
            elif op == b_("T*"):
                chunk += "\n"
            elif op == b_("'"):
                chunk += "\n"
                if isinstance(args[0], TextStringObject):
                    chunk += args[0]
            elif op == b_('"'):
                if isinstance(args[2], TextStringObject):
                    chunk += "\n"
                    chunk += args[2]

            if active_id is not None and chunk:
                self.table[active_id] += chunk

        self.strip_table_spaces()
Beispiel #2
0
def getTextByPage(self):
    """Return the text shown by this page's content stream.

    Only ``TextStringObject`` operands are collected; byte strings whose
    encoding is unknown are skipped because they would come out as
    gibberish.  A newline is added for ``T*``, ``'``, ``"`` and after every
    ``TJ`` array.

    :return: the collected text after one pass that replaces each pair of
        consecutive newlines with a single newline.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            pieces.extend(
                item for item in args[0]
                if isinstance(item, TextStringObject))
            pieces.append("\n")
    return u_("").join(pieces).replace("\n\n", "\n")
Beispiel #3
0
def extract_text_objects(page):
    """Yield the text pieces of a PageObject in content-stream order.

    Yields the ``TextStringObject`` operands of the text-showing operators
    (``Tj``, ``'``, ``"`` and the string elements of ``TJ`` arrays).  A plain
    newline string is also yielded for ``T*``, ``'``, ``"`` and after each
    ``TJ`` array, so consumers can rebuild line structure.

    Operands that are not ``TextStringObject`` (byte strings of unknown
    encoding, kerning numbers) are skipped.

    Note that the order may change as the PyPDF2 package evolves.

    Adapted from the extractText method of the PageObject class in
    PyPDF2.pdf.

    :param page: object supporting ``page["/Contents"]`` and ``page.pdf``.
    """
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, page.pdf)
    # Iterate regardless of whether the contents had to be wrapped above;
    # an already-parsed ContentStream must be walked as well.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_("T*"):
            yield "\n"
        elif operator == b_("'"):
            yield "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                yield _text
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                yield "\n"
                yield _text
        elif operator == b_("TJ"):
            for x in operands[0]:
                if isinstance(x, TextStringObject):
                    yield x
            yield "\n"
Beispiel #4
0
def extractTextList(self):
    """Return the non-empty text strings of this page as a list.

    Strings are taken from the ``Tj``, ``'``, ``"`` and ``TJ`` operators in
    content-stream order.  ``Tj`` strings are stripped of surrounding
    whitespace (and dropped if nothing remains); strings from the other
    operators are kept as-is and dropped only when empty.  Operands that are
    not ``TextStringObject`` are ignored.

    :return: list of text strings.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    def usable(candidate):
        # Non-empty, properly decoded text only.
        return isinstance(candidate, TextStringObject) and len(candidate) > 0

    found = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject) and len(shown.strip()) > 0:
                found.append(shown.strip())
        elif op == b_("'"):
            if usable(args[0]):
                found.append(args[0])
        elif op == b_('"'):
            if usable(args[2]):
                found.append(args[2])
        elif op == b_("TJ"):
            found.extend(item for item in args[0] if usable(item))
        # "T*" (next line) contributes nothing to the list.
    return found
def convert_page_to_text(page):
    """Return the text of ``page``, with a space after every shown string.

    Adapted from PyPDF2's extractText method.  Only ``TextStringObject``
    operands are used; byte strings of unknown encoding are skipped because
    they would come out as gibberish.  Newlines are added for ``T*``,
    ``'``, ``"`` and after each ``TJ`` array.

    :param page: object providing ``getContents()`` and ``pdf``.
    :return: the collected text.
    """
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0] + ' ')
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0] + ' ')
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2] + ' ')
        elif op == b_("TJ"):
            pieces.extend(
                item + ' ' for item in args[0]
                if isinstance(item, TextStringObject))
            pieces.append("\n")

    return u_("").join(pieces)
Beispiel #6
0
 def InsertXObject(self, name):
     """
     Return the drawing items for the XObject called ``name``.

     The XObject is looked up in the current page's /Resources.  A '/Form'
     XObject is converted once via ``ProcessOperators`` (wrapped in q ... Q
     with its /Matrix applied) and cached in ``self.formdrawings``.  An
     '/Image' XObject is passed to ``AddBitmap``; a falsy result (e.g. an
     unimplemented filter) adds nothing.  Any other subtype yields an empty
     list.
     """
     resources = self.page["/Resources"].getObject()
     stream = resources['/XObject'][name]
     subtype = stream.get('/Subtype')
     drawn = []
     if subtype == '/Form':
         if name not in self.formdrawings:
             # First use of this form: convert its content and cache it.
             pdf_fonts = self.FetchFonts(stream)
             x_bbox = stream.get('/BBox')  # read but not used here
             matrix = stream.get('/Matrix')
             form_ops = ContentStream(stream, self.pdfdoc).operations
             # save state + apply matrix, form contents, restore state
             oplist = [([], 'q'), (matrix, 'cm')]
             oplist += list(form_ops)
             oplist += [([], 'Q')]
             self.formdrawings[name] = self.ProcessOperators(oplist, pdf_fonts)
         drawn.extend(self.formdrawings[name])
     elif subtype == '/Image':
         width, height = stream['/Width'], stream['/Height']
         x_depth = stream['/BitsPerComponent']  # read but not used here
         filters = stream["/Filter"]
         bitmap = self.AddBitmap(stream._data, width, height, filters)
         if bitmap:
             drawn.append(bitmap)
     return drawn
 def extractOperators(self):
     """Return all operations of this page's content stream as a list."""
     stream = self["/Contents"].getObject()
     if isinstance(stream, ContentStream):
         parsed = stream
     else:
         parsed = ContentStream(stream, self.pdf)
     return list(parsed.operations)
Beispiel #8
0
def extractOperators(self):
    """
    Return every command of the content stream, in stream order, as a
    new list.  Used by pdfviewer.
    """
    stream = self["/Contents"].getObject()
    parsed = stream if isinstance(stream, ContentStream) else ContentStream(
        stream, self.pdf)
    collected = []
    collected.extend(parsed.operations)
    return collected
Beispiel #9
0
    def search(self):
        """Feed the text of every page to ``self.tables`` and return its tables.

        For each page the fonts are registered with ``self.converter`` and
        the content stream is walked.  Whenever an operation produces text,
        ``self.tables.process`` is called with the operands of the most
        recent ``re`` (rectangle) operator, the text, and the coordinates
        last set by ``Td`` or ``cm``.  The tracked state is reset per page.

        :return: the result of ``self.tables.get_tables()``.
        """
        from PyPDF2.pdf import ContentStream

        for page_no in range(self.pdf.getNumPages()):
            page = self.pdf.getPage(page_no)
            self.converter.process_fonts(page_no, page)

            stream = page["/Contents"].getObject()
            if not isinstance(stream, ContentStream):
                stream = ContentStream(stream, page)

            font = None
            pos_x = pos_y = None
            rect = None  # operands of the latest "re" (rectangle) operator

            for operands, operator in stream.operations:
                chunk = u_("")
                if operator == b_("re"):
                    rect = operands
                elif operator == b_("Tf"):
                    font = operands[0]
                elif operator in (b_("Tj"), b_("TJ")):
                    chunk += self.converter.process_text_objects(
                        operands, font)
                elif operator == b_("T*"):
                    chunk += "\n"
                elif operator == b_("'"):
                    chunk += "\n"
                    if isinstance(operands[0], TextStringObject):
                        chunk += operands[0]
                elif operator == b_('"'):
                    if isinstance(operands[2], TextStringObject):
                        chunk += "\n"
                        chunk += operands[2]
                elif operator == b_("Td"):
                    # both operands are taken as the text coordinates
                    pos_x, pos_y = operands
                elif operator == b_("cm"):
                    # the last two operands are taken as the coordinates
                    *_unused, pos_x, pos_y = operands

                if chunk:
                    self.tables.process(rect, chunk, pos_x, pos_y)

        CashObject().clean()
        return self.tables.get_tables()
Beispiel #10
0
 def process_content(self):
     """Blank out matching text on every page and add the page to the writer.

     For each ``TJ``/``Tj`` operation whose first operand satisfies
     ``any_match(..., self.remove_list)``, the operand is printed and then
     replaced with an empty ``TextStringObject``.  The edited stream is
     stored back as the page's /Contents before the page is added to
     ``self.writer``.
     """
     show_ops = (b_("TJ"), b_("Tj"))
     for index in range(self.reader.getNumPages()):
         page = self.reader.getPage(index)
         stream = ContentStream(page["/Contents"].getObject(), self.reader)
         for operands, operator in stream.operations:
             if operator not in show_ops:
                 continue
             shown = operands[0]
             if any_match(shown, self.remove_list):
                 print(shown)
                 operands[0] = TextStringObject('')
         page[NameObject('/Contents')] = stream
         self.writer.addPage(page)
Beispiel #11
0
 def extractOperators(self):
     """
     Locate and return all commands in the order they
     occur in the content stream.

     :return: list of ``(operands, operator)`` tuples.  Operators that are
         ``bytes`` are decoded to ``str``; any other operator is passed
         through unchanged.
     """
     ops = []
     content = self["/Contents"].getObject()
     if not isinstance(content, ContentStream):
         content = ContentStream(content, self.pdf)
     for op in content.operations:
         # Test the operator's type itself.  ``type(op[1] == bytes)`` is the
         # type of a boolean and therefore always truthy, which would call
         # .decode() on operators that are not bytes.
         if isinstance(op[1], bytes):
             op = (op[0], op[1].decode())
         ops.append(op)
     return ops
Beispiel #12
0
 def removeWatermark(self):
     """Debug pass over one hard-coded page (index 399) of ``self.pdfObj``.

     Prints the document info and the page's extracted text (latin-1
     encoded), parses the content stream, looks at the first element of each
     ``TJ`` array without changing it, and stores the parsed stream back as
     the page's /Contents.  No text is actually removed.
     """
     print(self.pdfObj.getDocumentInfo())
     # Only index 399 is visited; a loop over all pages was left disabled.
     for page_index in range(399, 400):
         page = self.pdfObj.getPage(page_index)
         print(page.extractText().encode('latin-1'))
         stream = ContentStream(page["/Contents"].getObject(), self.pdfObj)
         for operands, operator in stream.operations:
             if operator != b_("TJ"):
                 continue
             first_piece = operands[0][0]
             if isinstance(first_piece, TextStringObject):
                 pass  # placeholder: matching text is not modified
         page[NameObject('/Contents')] = stream
         print("\n\n")
Beispiel #13
0
    def swapColor(self, pageIndex, fromColor, toColor):
        """
        Replace every color-setting operation that matches fromColor with toColor.

        Operations whose operator is in ``self.operators`` are checked with
        the evaluator matching their operand count (1, 3 or 4 operands).
        The rewritten stream is stored back on the page and compressed.
        Progress is reported with print().

        :param pageIndex: index of evaluated page
        :param fromColor: color which will be substituted
        :param toColor: destination color
        :return: None
        """
        if pageIndex >= self.getNumPages():
            print("That page doesn't exist")
            return

        print("Evaluating page no. %d..." % pageIndex)
        page = self.getPage(pageIndex)
        stream = page["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, page.pdf)

        # Evaluator to use for a given number of operands.
        evaluators = {
            1: self._evaluateColor1,
            3: self._evaluateColor3,
            4: self._evaluateColor4,
        }

        replaced = 0
        for position, entry in enumerate(stream.operations):
            operands, operator = entry[0], entry[1]

            if operator == b_("cs"):
                self.printDebug("Nonstroking color space.")
                continue
            if operator not in self.operators:
                continue

            evaluate = evaluators.get(len(operands))
            if evaluate is None:
                continue
            if not evaluate(operator, operands, fromColor):
                continue

            replaced += 1
            # When _removeCSRef reports success the command to rewrite is
            # the one at the preceding index.
            if self._removeCSRef(stream, position):
                self._swapColorCmd(stream, position - 1, toColor)
            else:
                self._swapColorCmd(stream, position, toColor)

        page[NameObject("/Contents")] = stream
        page.compressContentStreams()
        print("Replaced %d references of given color.\n" % replaced)
Beispiel #14
0
def extractText_patch(self):
    """
    Extract the text of this page in content-stream order.

    Works like PyPDF2's extractText, with one addition: inside a ``TJ``
    array, a ``NumberObject`` smaller than -125 is rendered as a space.
    Only ``TextStringObject`` strings are used; byte strings of unknown
    encoding are skipped because they would come out as gibberish.  Do not
    rely on the order of the returned text.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            for item in args[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(item)
                elif isinstance(item, NumberObject) and item < -125:
                    pieces.append(" ")
            pieces.append("\n")
    return u_("").join(pieces)
Beispiel #15
0
    def extract_text_blocks(self) -> list:
        """
        Collect the operations between each BT (begin text) / ET (end text) pair.

        :return: a list with one entry per text block; each entry is the
            slice of ``content.operations`` strictly between a BT and the
            next ET.  An ET with no open BT is ignored.
        """
        content = self.pageObject["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pageObject.pdf)

        begin_text, end_text = utils.b_("BT"), utils.b_("ET")
        operations = content.operations
        blocks = []
        # None (not 0) marks "no open block": index 0 is a valid BT position,
        # so 0 cannot double as the sentinel.
        start = None
        for index, (_operands, operator) in enumerate(operations):
            if operator == begin_text:
                start = index
            elif operator == end_text and start is not None:
                blocks.append(operations[start + 1:index])
                start = None

        return blocks
Beispiel #16
0
def dopage(page):
    """Extract the text of ``page`` and translate it sentence by sentence.

    Text is collected from the ``Tj``, ``'``, ``"`` and ``TJ`` operators
    (``TextStringObject`` operands only).  A newline is inserted for ``rg``,
    ``T*`` and ``'``.  The text is split on ". " and each piece is passed to
    ``translate``; a piece whose translation raises is reported with print()
    and skipped.

    Relies on the module-level names ``pdf`` and ``translate``.

    :param page: object supporting ``page["/Contents"]``.
    :return: the translated pieces, each followed by a newline.
    """
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, pdf)

    text = u_("")
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text + " "
        elif operator == b_("rg"):
            text += "\n"
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += operands[0] + " "
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += _text + " "
        elif operator == b_("TJ"):
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
            text += " "

    results = ''
    for sentence in text.split('. '):
        try:
            results = results + translate(str(sentence)) + "\n"
        except Exception as e:
            # Best effort: one failed sentence must not abort the page, but
            # the failure has to be visible.
            print(e)
    return results
Beispiel #17
0
    def read_unicode(self, obj):
        """Build a code -> character table from bfchar/bfrange operations.

        ``obj`` is parsed as a content stream and every ``endbfchar`` /
        ``endbfrange`` operation is read (presumably ``obj`` is a font's
        ToUnicode CMap stream -- TODO confirm against the caller).  Keys are
        hex strings of the source code bytes; values are produced by
        ``convert`` from the destination bytes.

        NOTE(review): when ``obj`` already is a ``ContentStream`` nothing is
        parsed and an empty table is returned -- confirm this is intended.

        :param obj: stream object; must provide ``obj.pdf``.
        :return: dict mapping hex-string codes to converted values; empty
            when parsing raised ``ValueError``.
        """
        from PyPDF2.pdf import ContentStream
        content = None
        table = {}
        if not isinstance(obj, ContentStream):
            try:
                content = ContentStream(obj, obj.pdf)
            except ValueError:
                # Unparseable stream: fall through and return the empty table.
                pass
        if content is not None:
            for operands, operator in content.operations:
                if operator == b'endbfchar' or operator == b'endbfrange':
                    # bfchar entries come in pairs, bfrange entries in triples
                    count_el = 2 if operator == b'endbfchar' else 3
                    # table has two or three elements
                    for index in range(0, len(operands), count_el):
                        key = operands[index]
                        if not isinstance(key, ByteStringObject):
                            key = key.get_original_bytes()
                        key = key.hex()
                        # the last element of each entry is the destination
                        value = operands[index + count_el - 1]
                        if not isinstance(value, ByteStringObject):
                            value = value.get_original_bytes()
                        value = convert(value)
                        table[key] = value
                        if count_el == 3 and operands[index] != operands[index
                                                                         + 1]:
                            start = operands[index].get_original_bytes().hex()
                            end = operands[index +
                                           1].get_original_bytes().hex()
                            # sometimes a range of values is specified, so the
                            # font table is extended dynamically
                            # NOTE(review): hex(i) drops leading zeros while
                            # bytes.hex() above keeps them, so keys added here
                            # may be formatted differently -- verify lookups.
                            # NOTE(review): ord(value) requires ``value`` to
                            # be a single character -- confirm convert().
                            for i in range(
                                    int(start, 16) + 1,
                                    int(end, 16) + 1):
                                key = hex(i).split('x')[-1]
                                value = chr(ord(value) + 1)
                                table[key] = value

        return table
Beispiel #18
0
 def remove_text_from_normal_page(self, pg, pdf):
     """Blank out text-showing operations that start with ``self.wmtext``.

     Handles both ``Tj`` (single string operand) and ``TJ`` (array of
     strings and spacing numbers).  For ``TJ`` the ``TextStringObject``
     elements are concatenated before the prefix test, and a matching array
     is emptied in place so the operation stays a well-formed ``TJ``.  The
     edited stream is stored back as the page's /Contents.  If nothing
     matched and ``self.form`` is truthy, the page is passed on to
     ``remove_form_from_normal_page``.

     :param pg: page object to clean.
     :param pdf: document object handed to ``ContentStream``.
     :return: the (possibly replaced) page object.
     """
     content_object = pg["/Contents"].getObject()
     content = ContentStream(content_object, pdf)
     flag = False
     for operands, operator in content.operations:
         if operator not in (b_('TJ'), b_('Tj')):
             continue
         shown = operands[0]
         # isinstance, not an exact type check: parsed arrays are a list
         # subclass, so ``type(shown) is list`` would never match them.
         if isinstance(shown, list):
             text = ''.join(
                 x for x in shown if isinstance(x, TextStringObject))
             if text.startswith(self.wmtext):
                 del shown[:]
                 flag = True
         elif isinstance(shown, TextStringObject) and shown.startswith(
                 self.wmtext):
             operands[0] = TextStringObject('')
             flag = True
     pg[NameObject('/Contents')] = content
     if not flag and self.form:
         pg = self.remove_form_from_normal_page(pg)
     return pg
 def customExtractText(self):
     """Return the page text with whitespace normalised to single spaces.

     Only ``TextStringObject`` strings are used; byte strings of unknown
     encoding are skipped because they would come out as gibberish.  Inside
     a ``TJ`` array a number below -100 becomes a space, and ``TD``/``Tm``
     add a space unless the text already ends in a space or newline.
     Finally " - " is replaced by "-" and every whitespace run is collapsed
     to one space.
     """
     stream = self["/Contents"].getObject()
     if not isinstance(stream, ContentStream):
         stream = ContentStream(stream, self.pdf)

     out = u_("")
     for args, op in stream.operations:
         if op == b_("Tj"):
             if isinstance(args[0], TextStringObject):
                 out += args[0]
         elif op == b_("T*"):
             out += "\n"
         elif op == b_("'"):
             out += "\n"
             if isinstance(args[0], TextStringObject):
                 out += args[0]
         elif op == b_('"'):
             if isinstance(args[2], TextStringObject):
                 out += "\n"
                 out += args[2]
         elif op == b_("TJ"):
             for item in args[0]:
                 if isinstance(item, TextStringObject):
                     out += item
                 elif isinstance(item, (FloatObject, NumberObject)):
                     if item < -100:
                         out += " "
         elif op in (b_("TD"), b_("Tm")):
             if out and out[-1] not in (" ", "\n"):
                 out += " "
     out = out.replace(" - ", "-")
     return re.sub("\\s+", " ", out)
Beispiel #20
0
    def alt_extractText(self):
        """
        Extract the page text, adding newlines and marking other operators.

        Text comes from ``Tj``, ``'``, ``"`` and ``TJ`` (``TextStringObject``
        operands only); newlines are added for ``T*``, ``'``, ``"`` and after
        each ``TJ`` array.  Every operation that is none of these appends a
        "~" marker to the output.

        :return: a unicode string object.
        """
        after_tj = ""      # appended after every Tj string
        other_mark = "~"   # emitted for every non-text operator

        stream = self["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, self.pdf)

        pieces = []
        for args, op in stream.operations:
            if op == b_("Tj"):
                if isinstance(args[0], TextStringObject):
                    pieces.append(args[0] + after_tj)
            elif op == b_("T*"):
                pieces.append("\n")
            elif op == b_("'"):
                pieces.append("\n")
                if isinstance(args[0], TextStringObject):
                    pieces.append(args[0])
            elif op == b_('"'):
                if isinstance(args[2], TextStringObject):
                    pieces.append("\n")
                    pieces.append(args[2])
            elif op == b_("TJ"):
                pieces.extend(
                    item for item in args[0]
                    if isinstance(item, TextStringObject))
                pieces.append("\n")
            else:
                pieces.append(other_mark)
        return u_("").join(pieces)
Beispiel #21
0
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """
    Extract the page text, putting ``Tj_sep`` in front of every Tj string.

    Only ``TextStringObject`` strings are used; byte strings of unknown
    encoding are skipped because they would come out as gibberish.
    Newlines are added for ``T*``, ``'``, ``"`` and after each ``TJ`` array.

    :param Tj_sep: separator inserted before each string shown by ``Tj``.
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    collected = u_("")
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                collected = collected + Tj_sep + args[0]
        elif op == b_("T*"):
            collected += "\n"
        elif op == b_("'"):
            collected += "\n"
            if isinstance(args[0], TextStringObject):
                collected += args[0]
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                collected = collected + "\n" + args[2]
        elif op == b_("TJ"):
            for item in args[0]:
                if not isinstance(item, TextStringObject):
                    continue
                collected += item
            collected += "\n"
    return collected
Beispiel #22
0
    pdf = PdfFileReader(pdf_file)
    pages_per_part = math.floor(pdf.getNumPages() / parts)

    rule_index = 0
    current_text = ''
    current_color = 'White'
    current_rule_actions = []
    overflowing = None
    is_on_speech_person_font = False

    for part in range(parts):
        script = base_script

        for i in range(1 + pages_per_part * part, pages_per_part * (part + 1)):
            page: PageObject = pdf.getPage(i)
            contentStream = ContentStream(page.getContents().getObject(), pdf)
            for operands, operator in contentStream.operations:
                if operator == b'Tf':
                    is_on_speech_person_font = operands[0] == '/F1'
                elif operator == b'Tj':
                    text = operands[0].strip()
                    if text.strip() in ('', ':') or re.match(
                            r'Page \d+/123', text):
                        # is ignored
                        continue
                    elif re.match(r'^=*[A-Z0-9# ]+(?::|=+)$',
                                  text) or is_on_speech_person_font:
                        # is speech person
                        if len(current_rule_actions) != 0:
                            current_rule_actions.append(None)
def pageOperations(page):
    """Return ``contentOperations`` applied to the page's decoded content.

    ``getData()`` is called only to trigger decoding; its return value is
    discarded and the stream's ``decodedSelf`` is what gets parsed.
    """
    stream_obj = page.getContents().getObject()
    stream_obj.getData()  # trigger decoding before decodedSelf is read
    parsed = ContentStream(stream_obj.decodedSelf, page.pdf)
    return contentOperations(parsed)
Beispiel #24
0
    def removeWordStyle(self, ignoreByteStringObject=False):
        """
        Removes imported styles from Word - Path Constructors rectangles - from this output.

        For every page listed under /Kids of ``self._pages`` the content
        stream is rewritten operation by operation:

        * rgb/cmyk color operations get their operands replaced with black
          (when they target text) or white (when they target a rectangle);
        * ``re`` rectangles whose width exceeds a font-size-based minimum and
          whose height lies in (1.5, last font size + 6] are dropped;
        * all other operations are kept unchanged.

        The rewritten stream is stored back as the page's /Contents.

        :param bool ignoreByteStringObject: optional parameter
            to ignore ByteString Objects.  When true, text operands that are
            not ``TextStringObject`` are replaced with an empty
            ``TextStringObject``.
        """

        pages = self.getObject(self._pages)['/Kids']
        for j in range(len(pages)):
            page = pages[j]
            pageRef = self.getObject(page)
            content = pageRef['/Contents'].getObject()

            if not isinstance(content, ContentStream):
                content = ContentStream(content, pageRef)

            # operations that survive the filtering below
            _operations = []
            # size operand of the most recent Tf whose font name starts with /F
            last_font_size = 0

            for operator_index, (operands,
                                 operator) in enumerate(content.operations):

                if operator == b_('Tf') and operands[0][:2] == '/F':
                    last_font_size = operands[1].as_numeric()

                # Text-showing operators: optionally blank out operands that
                # are not TextStringObject.
                if operator == b_('Tj'):
                    text = operands[0]
                    if ignoreByteStringObject:
                        if not isinstance(text, TextStringObject):
                            operands[0] = TextStringObject()
                elif operator == b_("'"):
                    text = operands[0]
                    if ignoreByteStringObject:
                        if not isinstance(text, TextStringObject):
                            operands[0] = TextStringObject()
                elif operator == b_('"'):
                    text = operands[2]
                    if ignoreByteStringObject:
                        if not isinstance(text, TextStringObject):
                            operands[2] = TextStringObject()
                elif operator == b_("TJ"):
                    for i in range(len(operands[0])):
                        if ignoreByteStringObject:
                            if not isinstance(operands[0][i],
                                              TextStringObject):
                                operands[0][i] = TextStringObject()

                operator_type = self._getOperatorType(operator)

                # we are ignoring all grayscale colors
                # tests showed that black underlines, borders and tables are defined by grayscale and aren't using rgb/cmyk colors
                if operator_type == 'rgb' or operator_type == 'cmyk':

                    color_target_operation_type = self._getColorTargetOperationType(
                        operator_index, content.operations)

                    new_color = None

                    # we are coloring all text in black and all rectangles in white
                    # removing all colors paints rectangles in black which gives us unwanted results
                    if color_target_operation_type == 'text':
                        new_color = 'black'
                    elif color_target_operation_type == 'rectangle':
                        new_color = 'white'

                    if new_color:
                        # rebinds the local only; the replacement operands are
                        # what gets appended to _operations below
                        operands = self.colors_operands[operator_type][
                            new_color]

                # remove styled rectangles (highlights, lines, etc.)
                # the 're' operator is a Path Construction operator, creates a rectangle()
                # presumably, that's the way Word embeds all of its graphics into a PDF when creating one
                if operator == b_('re'):

                    # the last two operands are read as width and height
                    rectangle_width = operands[-2].as_numeric()
                    rectangle_height = operands[-1].as_numeric()

                    minWidth = self.getMinimumRectangleWidth(
                        last_font_size,
                        1)  # (length of X letters at the current size)
                    maxHeight = last_font_size + 6  # range to catch really big highlights
                    minHeight = 1.5  # so that thin lines will not be removed

                    # remove only style that:
                    # its width is bigger than the minimum
                    # its height is smaller than maximum and larger than minimum
                    if rectangle_width > minWidth and rectangle_height > minHeight and rectangle_height <= maxHeight:
                        # skip the append below: this rectangle is dropped
                        continue

                _operations.append((operands, operator))

            content.operations = _operations
            pageRef.__setitem__(NameObject('/Contents'), content)
Beispiel #25
0
    """
    loc = np.array([i.as_numeric() for i in location])
    diff = np.abs(loc - target)
    return np.any(diff.max(axis=1) < epsilon)


# Judging from the results, for watermarks added by MS-Word some of the
# instructions are mixed in with normal data, so finer-grained debugging and
# handling is needed.
#
# For every page, parse the content stream and empty the operand list (in
# place) of operations identified as watermark-related.  The operator itself
# stays in cs.operations.
# NOTE(review): the visible lines never add pages to pdf_out or write to
# f_out -- the snippet appears to be cut off here.
with open(with_wm_path, 'rb') as f, open(nowm_out_path, 'wb') as f_out:
    pdf = PdfFileReader(f)
    pdf_out = PdfFileWriter()
    # print(pdf.getDocumentInfo())
    cn_pages = pdf.getNumPages()
    for i in range(cn_pages):
        page = pdf.getPage(i)
        content = page.getContents()
        cs = ContentStream(content, pdf)
        for operands, operator in cs.operations:
            # `b_` is just boilerplate for bytes conversion across Python 2/3
            if operator == b_('Tm') and match_location(operands, TARGET_TXT):
                # text matrix matching one of the target text locations
                operands[:] = []
            elif operator == b_('cm') and match_location(operands, TARGET_IMG):
                # transformation matrix matching one of the target image locations
                operands[:] = []
            elif operator == b_('gs'):
                if operands[0] == '/GS0':
                    operands[:] = []
            elif operator == b_('Do'):
                # the operand is the name of the referenced image / XObject
                if operands[0] == '/Im0':
                    # /Im0 is deliberately left untouched
                    pass
                elif operands[0] == '/Fm0':
                    operands[:] = []
Beispiel #26
0
        def extractText(self, skip_intertwined_text=True):
            """
            Locate all text drawing commands, in the order they are provided in the
            content stream, and extract the text, approximating the layout with
            spaces and newlines.  This works well for some PDF files, but poorly
            for others, depending on the generator used.  Do not rely on the order
            of text coming out of this function.

            Layout heuristics (the divisors 20, 50 and 100 are empirical):

            * numbers inside a ``TJ`` array add ``int(n / -100)`` spaces unless
              the text already ends in a space or newline;
            * ``Td`` with a zero second operand pads with spaces up to
              ``int(x / 20)`` columns; if that is less than the width already
              written, the next ``TJ`` is flagged as intertwined;
            * ``Td`` with a non-zero second operand adds newlines and
              re-indents;
            * ``Tm`` sets the indent from its fifth operand;
            * ``TD`` adds a single separating space.

            :param bool skip_intertwined_text: when true, a ``TJ`` flagged as
                intertwined by the preceding ``Td`` is dropped.
            :return: a unicode string object.
            """
            text = u_("")
            content = self["/Contents"].getObject()
            if not isinstance(content, ContentStream):
                content = ContentStream(content, self.pdf)
            # Note: we check all strings are TextStringObjects.  ByteStringObjects
            # are strings where the byte->string encoding was unknown, so adding
            # them to the text here would be gibberish.
            indent = 0
            previous_width = 0
            skip_next = False
            for operands, operator in content.operations:
                if not operands:  # Empty operands list contributes no text
                    operands = [""]
                if operator == b_("Tj"):
                    _text = operands[0]
                    if isinstance(_text, TextStringObject):
                        text += _text
                elif operator == b_("T*"):
                    text += "\n"
                elif operator == b_("'"):
                    text += "\n"
                    _text = operands[0]
                    if isinstance(_text, TextStringObject):
                        text += operands[0]
                elif operator == b_('"'):
                    _text = operands[2]
                    if isinstance(_text, TextStringObject):
                        text += "\n"
                        text += _text
                elif operator == b_("TJ"):
                    if skip_intertwined_text and skip_next:
                        skip_next = False
                    else:
                        for i in operands[0]:
                            if isinstance(i, TextStringObject):
                                text += i
                                previous_width += len(i)
                            elif isinstance(i, (FloatObject, NumberObject)):
                                if text and text[-1] not in " \n":
                                    text += " " * int(i / -100)
                                    previous_width += int(i / -100)
                elif operator == b_("Td"):
                    indent = indent + operands[0]
                    if operands[1] == 0:
                        if int(operands[0] / 20) >= previous_width:
                            text += " " * (int(operands[0] / 20) - previous_width)
                        else:
                            skip_next = True
                            # If skip_intertwined_text is false, this will result in no space between the two 'lines'
                    else:
                        previous_width = 0
                        text += "\n" * max(0, int(operands[1] / -50)) + " " * max(0, int(indent / 20))
                elif operator == b_("Tm"):
                    indent = operands[4]
                    text += " " * max(0, int(indent / 20))
                elif operator == b_("TD"):
                    # Only TD is tested here: Tm is fully handled by the
                    # branch above, so repeating it would be dead code.
                    if text and text[-1] not in " \n":
                        text += " "
            return text

def match_location(location, target, epsilon=1e-5):
    """Tell whether `location` coincides with any row of `target`.

    Each element of `location` is converted with its ``as_numeric()``
    method; a row of `target` matches when its largest absolute
    component-wise difference from those values is below `epsilon`.

    :param location: sequence of objects exposing ``as_numeric()``.
    :param target: 2-D array-like of candidate rows (one row per
        candidate, same width as `location`).
    :param epsilon: exclusive upper bound on the per-row max deviation.
    :return: numpy boolean, true when at least one row matches.
    """
    # target must be an n*6 numpy matrix
    values = np.array([operand.as_numeric() for operand in location])
    worst_deviation_per_row = np.abs(values - target).max(axis=1)
    return np.any(worst_deviation_per_row < epsilon)


# Walk every page of `source` and blank out the operands of each "cm"
# operation whose operands match one of the rows in `target_locations`
# (per the notes below, those rows are the watermark's identifying feature).
# NOTE(review): nothing visible here writes `content` back to the page or
# saves the document -- presumably that happens in code not shown; confirm.
for p in range(source.getNumPages()):
    page = source.getPage(p)
    # print(page.extractText())
    #content_object, = page["/Contents"][0].getObject()
    # Only the entry at index 1 of the page's /Contents is processed.
    # NOTE(review): this assumes /Contents is an array with at least two
    # entries -- confirm for the documents being handled.
    content_object = page["/Contents"][1]

    content = ContentStream(content_object, source)
    for operands, operator in content.operations:
        # print(operator, operands) # type and value of the pdf element

        # The main logic is here: use whatever means are available to find
        # a feature by which the watermark can be recognised.
        # if operator == b_("TJ"): # `b_` is just redundant bytes-conversion code for Python 2/3
        #     text = operands[0][0]
        #     # if isinstance(text, bytes):
        #     #     print('====  ', text, '  ====')
        #     #     for c in guess_codes(text):
        #     #         print(c, text.decode(c))
        #     if isinstance(text, TextStringObject) and text in target_str:
        #         operands[0] = TextStringObject('')

        # Empty the operand list in place so the change is visible through
        # content.operations.
        if operator == b_("cm") and match_location(operands, target_locations):
            operands[:] = []
Beispiel #28
0
def extract_ops(page: PageObject) -> List[Tuple]:
    """Extract all content-stream operations of a page.

    :param page: the page whose contents are inspected.
    :return: a list of the entries of the content stream's ``operations``
        (each entry unpacks as ``operands, operator``); an empty list when
        the page has no contents at all.
    """
    content = page.getContents()
    if content is None:
        # /Contents is optional; getContents() yields None for such a page
        # and ContentStream cannot be built from None.
        return []
    if not isinstance(content, ContentStream):
        content = ContentStream(content, page.pdf)
    return list(content.operations)
Beispiel #29
0
def _nupPlacePage(page1, page2, destRect):
    """Draw source page `page2`, scaled into `destRect`, onto sheet `page1`.

    `page1` is modified in place: its /Resources become the merge of both
    pages' resources and its /Contents become the old contents followed by
    the transformed contents of `page2`.

    :param page1: destination page (one sheet of the N-up output).
    :param page2: source page to place on the sheet.
    :param destRect: ``(x, y, width, height)`` of the target area.
    """
    PO, AO, DO, NO = PageObject, ArrayObject, DictionaryObject, NameObject

    pageWidth, pageHeight = page2.mediaBox.upperRight
    destX, destY, destWidth, destHeight = destRect
    xScale, yScale = calcScalingFactors(
        destWidth, destHeight, pageWidth, pageHeight)

    # Merge the resource dictionaries; `rename` collects the renamings
    # reported by the merge so they can be applied to page2's content.
    newResources = DO()
    rename = {}
    orgResources = page1["/Resources"].getObject()
    page2Resources = page2["/Resources"].getObject()

    names = "ExtGState Font XObject ColorSpace Pattern Shading"
    for res in names.split():
        res = "/" + res
        new, newrename = PO._mergeResources(orgResources,
                                            page2Resources, res)
        if new:
            newResources[NO(res)] = new
            rename.update(newrename)

    newResources[NO("/ProcSet")] = AO(
        frozenset(orgResources.get("/ProcSet", AO()).getObject()).union(
            frozenset(page2Resources.get("/ProcSet", AO()).getObject())
        )
    )

    newContentArray = AO()
    orgContent = page1["/Contents"].getObject()
    newContentArray.append(PO._pushPopGS(orgContent, page1.pdf))
    page2Content = page2['/Contents'].getObject()
    page2Content = PO._contentStreamRename(page2Content, rename,
                                           page1.pdf)
    page2Content = ContentStream(page2Content, page1.pdf)
    # Wrap page2's operations in q ... Q with the placement matrix first.
    page2Content.operations.insert(0, [[], "q"])

    # handle rotation
    try:
        rotation = page2["/Rotate"].getObject()
    except KeyError:
        rotation = 0
    if rotation in (180, 270):
        dw, dh = destWidth, destHeight
        arr = [-xScale, 0, 0, -yScale, destX + dw, destY + dh]
    else:
        # 0, 90 and any other (illegal) rotation are treated as 0
        arr = [xScale, 0, 0, yScale, destX, destY]

    arr = [FloatObject(str(x)) for x in arr]
    page2Content.operations.insert(1, [arr, "cm"])
    page2Content.operations.append([[], "Q"])
    newContentArray.append(page2Content)
    page1[NO('/Contents')] = ContentStream(newContentArray, page1.pdf)
    page1[NO('/Resources')] = newResources


def generateNup(inPathOrFile, n, outPathPatternOrFile=None, dirs="RD",
                verbose=False):
    """Generate a N-up document version.

    If outPathPatternOrFile is None, the output will be written
    in a file named after the input file.

    :param inPathOrFile: path of the source PDF, or a file-like object.
    :param n: number of source pages per output sheet; must satisfy
        ``isSquare(n)`` or ``isHalfSquare(n)``.
    :param outPathPatternOrFile: a file-like object to write to, or a
        string. With a path input the string is a %-pattern that may use
        the keys ``dirname``, ``basename``, ``base``, ``ext`` and ``n``;
        with a file-like input it is used as the output path verbatim.
    :param dirs: passed through to ``calcRects`` when laying out the
        mini page areas (presumably the fill directions -- confirm there).
    :param verbose: when true, print where the result was written.
    :return: the output path, or None when the output went to a
        file-like object.
    :raises AssertionError: if `n` is not accepted, or a file-like input
        is given without an output.
    :raises TypeError: if an argument is neither a string nor file-like.
    """

    assert isSquare(n) or isHalfSquare(n)

    ipof = inPathOrFile
    oppof = outPathPatternOrFile

    # outPath stays None when the caller supplies a file-like output, so
    # the final return no longer fails in that case.
    outPath = None
    outFile = None
    ownsInFile = False

    if isFileLike(ipof):
        inFile = ipof
        if oppof is None:
            raise AssertionError("Must specify output for file input!")
        elif isFileLike(oppof):
            outFile = oppof
        elif isinstance(oppof, str):
            outPath = oppof
        else:
            raise TypeError("output must be a path or a file-like object")
    elif isinstance(ipof, str):
        if isFileLike(oppof):
            outFile = oppof
        elif oppof is None or isinstance(oppof, str):
            if oppof is None:
                oppof = "%(dirname)s/%(base)s-%(n)dup%(ext)s"
            aDict = {
                "dirname": os.path.dirname(inPathOrFile) or ".",
                "basename": os.path.basename(inPathOrFile),
                "base": os.path.basename(os.path.splitext(inPathOrFile)[0]),
                "ext": os.path.splitext(inPathOrFile)[1],
                "n": n,
            }
            outPath = os.path.normpath(oppof % aDict)
        else:
            raise TypeError("output must be a path or a file-like object")
        inFile = open(ipof, "rb")
        ownsInFile = True
    else:
        raise TypeError("input must be a path or a file-like object")

    try:
        # get info about source document
        docReader = PdfFileReader(inFile)
        numPages = docReader.getNumPages()
        oldPageSize = docReader.getPage(0).mediaBox.upperRight

        # create empty output document buffer
        if isSquare(n):
            newPageSize = oldPageSize
        elif isHalfSquare(n):
            newPageSize = oldPageSize[1], oldPageSize[0]
        # Sheets are indexed by srcPageNum // n, so ceil(numPages / n) of
        # them are needed (the former "// n + % n" over-allocated).
        numNewPages = -(-numPages // n)
        buf = exP1multiN(_mtA4Pdf, newPageSize, numNewPages)

        # calculate mini page areas
        rects = calcRects(newPageSize, n, dirs)

        # combine
        srcPages = [docReader.getPage(i) for i in range(numPages)]
        outReader = PdfFileReader(buf)
        outPages = [outReader.getPage(i)
                    for i in range(outReader.getNumPages())]
        output = PdfFileWriter()

        for destPageNum in range(numNewPages):
            page1 = outPages[destPageNum]
            firstSrc = destPageNum * n
            lastSrc = min(firstSrc + n, numPages)
            for srcPageNum in range(firstSrc, lastSrc):
                _nupPlacePage(page1, srcPages[srcPageNum],
                              rects[srcPageNum - firstSrc])
            output.addPage(page1)

        if outPath is not None:
            # Open the target exactly once and close it deterministically.
            with open(outPath, "wb") as pathFile:
                output.write(pathFile)
        else:
            # The caller owns a file-like output; do not close it.
            output.write(outFile)
    finally:
        # Only close the input if this function opened it, and only after
        # the output has been written from it.
        if ownsInFile:
            inFile.close()

    if verbose:
        if outPath is not None:
            print(("written: %s" % outPath))
        else:
            print("written to file-like input parameter")

    return outPath
def update(filename, template_page):
    """Merge every page of a PDF into `template_page` and rewrite the file.

    The file is only rewritten when all pages are recognised: either
    produced by "eBridgeToolkit 7.1" (the covering image and the
    invisible-text flag are removed and font /T1_0 is switched to
    Courier), or created by "Intergy" with title "CMSPrePrinted1500212".
    In every other case a message is printed and the file is left as is.

    :param filename: path of the PDF to process in place.
    :param template_page: page object; a shallow copy of it receives each
        data page via ``mergePage``.
    :return: None. Progress and skip reasons are printed.
    """
    if not filename.lower().endswith(".pdf"):
        print(f"Not a pdf file: {filename}")
        return

    try:
        source_file = open(filename, "rb")
    except OSError as e:
        print(f"Can't open {filename}: {e}")
        return

    # Keep the source stream open until the output has been written, then
    # close it deterministically (it used to be leaked).
    with source_file:
        try:
            data = PdfFileReader(source_file)
        except OSError as e:
            print(f"Can't open {filename}: {e}")
            return
        except PdfReadError as e:
            print(f"{filename} is not a valid PDF file: {e}")
            return

        info = data.getDocumentInfo()
        producer = None
        creator = None
        title = None
        if info:
            producer = info.get("/Producer", None)
            creator = info.get("/Creator", None)
            title = info.get("/Title", None)

        # Check if we've already filled this
        if producer in ("PyPDF2", SELF_PRODUCER):
            print(f"Skipping {filename}: Already added CMS-1500")
            return

        if data.getNumPages() < 1:
            print(f"Skipping {filename}: No pages")
            # Without this return an empty PDF would overwrite the file.
            return

        output = PdfFileWriter()
        output.addMetadata(
            {"/Producer": codecs.BOM_UTF16_BE + SELF_PRODUCER.encode("utf-16be")})

        for page_no in range(data.getNumPages()):
            data_page = data.getPage(page_no)

            # If it's printed through the eBridge printer driver, it has an
            # image of the output with invisible text on top; look for those
            # and strip off the image
            if producer == "eBridgeToolkit 7.1":
                # Set a fixed-width font
                font = data_page[NameObject("/Resources")][NameObject("/Font")]
                if NameObject("/T1_0") not in font:
                    print(
                        f"Skipping {filename}: it does not match the expected format (font name)"
                    )
                    return
                font[NameObject("/T1_0")][NameObject("/BaseFont")] = NameObject(
                    "/Courier")

                # Remove the image that covers everything
                content = ContentStream(data_page["/Contents"].getObject(), data)
                ops = [op[1] for op in content.operations[0:5]]
                if ops != [b"q", b"cm", b"Do", b"Q", b"rg"]:
                    print(
                        f"Skipping {filename}: it does not match the expected format (obscuring image)"
                    )
                    return
                del content.operations[0:5]

                # Remove the flag that makes the text hidden; a stream too
                # short to hold it is reported as a mismatch rather than
                # raising IndexError.
                if (len(content.operations) < 3
                        or content.operations[2] != ([3], b"Tr")):
                    print(
                        f"Skipping {filename}: it does not match the expected format (font invisible)"
                    )
                    return
                del content.operations[2]

                # Write that back
                data_page[NameObject("/Contents")] = content
            elif creator == "Intergy" and title == "CMSPrePrinted1500212":
                # Nothing to do; these overlay just fine
                pass
            else:
                print(f"Skipping {filename}: Unknown PDF")
                return

            merged_page = copy.copy(template_page)
            merged_page.mergePage(data_page)
            output.addPage(merged_page)

        # Write the output to a temporary file, so that any failures in
        # writing don't affect the original
        with NamedTemporaryFile() as output_file:
            output.write(output_file)
            output_file.flush()
            shutil.copy(output_file.name, filename)

    print(f"Successfully processed {filename}")