Ejemplo n.º 1
0
def extract_text(self):
    """Extract the text shown on this page, keeping snippets apart.

    Walks the operations of the page's ``/Contents`` stream and gathers the
    string operands of the text operators:

    * ``Tj`` and ``TJ`` contribute their strings,
    * ``T*`` contributes a newline,
    * ``'`` contributes a newline followed by its string,
    * ``"`` contributes a newline and its string, but only when that string
      is usable.

    Only ``TextStringObject`` operands are used; any other operand is
    skipped.  After every operation (text-related or not) one space is
    appended unless the text gathered so far is empty or already ends with
    a space.

    Returns:
        The gathered text as a unicode string.  A page without a
        ``/Contents`` entry yields ``u""`` instead of raising ``KeyError``.
    """
    try:
        contents_ref = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary; no stream, no text.
        return u""
    content = contents_ref.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    text = u""
    for operands, operator in content.operations:
        if operator == "Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == "T*":
            text += "\n"
        elif operator == "'":
            # The newline is added even when the string operand is skipped.
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == '"':
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == "TJ":
            # The array operand mixes strings with other entries; keep
            # only the text strings.
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
        if text and not text.endswith(" "):
            text += " "  # Don't let words concatenate
    return text
Ejemplo n.º 2
0
def extractPDFText(self):
    """Extract the text shown on this page.

    Walks the operations of the page's ``/Contents`` stream and gathers the
    string operands of the ``Tj``, ``'``, ``"`` and ``TJ`` operators.
    ``T*``, ``'`` and ``k`` each add a newline; ``"`` adds a newline only
    when its string operand is usable.

    Returns:
        The gathered text as a unicode string.  A page without a
        ``/Contents`` entry yields ``u""`` instead of raising ``KeyError``.
    """
    try:
        contents_ref = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary; no stream, no text.
        return u""
    content = contents_ref.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    # Pieces are collected and joined once instead of growing a string.
    parts = []
    for operands, operator in content.operations:
        if operator == "Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                parts.append(_text)
        elif operator == "T*":
            parts.append("\n")
        elif operator == "'":
            # The newline is added even when the string operand is skipped.
            parts.append("\n")
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                parts.append(_text)
        elif operator == '"':
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                parts.append("\n")
                parts.append(_text)
        elif operator == "TJ":
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    parts.append(i)
        elif operator == "k":
            # NOTE(review): "k" is a colour operator in the PDF operator
            # set, not a text one; presumably used here as a heuristic
            # line separator -- confirm.
            parts.append("\n")
    return u"".join(parts)
Ejemplo n.º 3
0
def extractOperators(self):
    """Return the operations of this page's content stream as a new list.

    Returns:
        A list holding every entry of the ``operations`` attribute of the
        page's ``/Contents`` stream, in order.  A page without a
        ``/Contents`` entry yields an empty list instead of raising
        ``KeyError``.
    """
    try:
        contents_ref = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary; nothing to list.
        return []
    content = contents_ref.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Shallow copy, so callers can reorder the list without touching the
    # stream's own operations list.
    return list(content.operations)
Ejemplo n.º 4
0
 def replace_text(cls, page, text, replace):
     """Build a copy of *page* in which *text* is replaced by *replace*.

     The page's ``/Contents`` is parsed into a ``ContentStream`` and the
     first operand of every ``Tj`` operation is rewritten with
     ``str.replace``-style substitution, then wrapped in a
     ``TextStringObject``.  Other operators are left untouched.  A blank
     page is created, *page* is merged into it, and its ``/Contents`` is
     set to the rewritten stream.

     ``cls`` is not used by the body.

     Returns:
         The newly created page object.
     """
     # HACK
     from pyPdf.pdf import ContentStream, PageObject
     from pyPdf.generic import TextStringObject, NameObject

     stream = ContentStream(page["/Contents"].getObject(), page.pdf)
     for args, op in stream.operations:
         if op != 'Tj':
             continue
         # The operand list is edited in place, so the stream's operations
         # see the new string.
         substituted = args[0].replace(text, replace)
         args[0] = TextStringObject(substituted)

     result = PageObject.createBlankPage(page.pdf)
     result.mergePage(page)
     result[NameObject('/Contents')] = stream
     return result
Ejemplo n.º 5
0
def pdf_add_content(content_string, page, scale=1, offsetx=0, offsety=0):
    """Append drawing commands to the content stream of a PDF page.

    The page's ``/Contents`` entry is replaced by a ``ContentStream`` built
    from the existing content (or from an empty ``ArrayObject`` when the
    page has no ``/Contents``), with a ``q`` operation inserted at the front
    and the new commands added at the end.

    Args:
        content_string: The PDF drawing commands to add, as a single string.
        page: The pyPdf.pdf.PageObject to add the content to.
        scale: Uniform scale factor applied to the coordinate system before
            the new content is drawn.
        offsetx: Horizontal translation applied together with ``scale``.
        offsety: Vertical translation applied together with ``scale``.

    Returns:
        None; ``page`` is modified in place.
    """
    matrix_args = (scale, scale, offsetx, offsety)
    transform = '%.2f 0 0 %.2f %.2f %.2f cm' % matrix_args
    # Leading 'Q' pairs with the 'q' inserted in front of the old content;
    # the following 'q' ... 'Q' pair brackets the transform and new commands.
    appended = '\n'.join(['Q', 'q', transform, content_string, 'Q'])

    try:
        existing = page['/Contents'].getObject()
    except KeyError:
        existing = ArrayObject([])

    stream = ContentStream(existing, page.pdf)
    ops = stream.operations
    # Existing content may not restore the graphics state at the end, so it
    # is wrapped in its own save/restore pair.
    ops.insert(0, [[], 'q'])
    # The whole command text is stored as the operator of one operation
    # that has no operands.
    ops.append([[], appended])
    page[NameObject('/Contents')] = stream
Ejemplo n.º 6
0
def extract_text(self):
    """Patched extractText() from pyPdf that puts spaces between snippets.

    Walks the operations of the page's ``/Contents`` stream and gathers the
    string operands of the ``Tj``, ``'``, ``"`` and ``TJ`` operators.
    ``T*`` and ``'`` each add a newline; ``"`` adds a newline only when its
    string operand is usable.  After every operation (text-related or not)
    one space is appended unless the text gathered so far is empty or
    already ends with a space.

    Returns:
        The gathered text as a unicode string.  A page without a
        ``/Contents`` entry yields ``u""`` instead of raising ``KeyError``.
    """
    try:
        contents_ref = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary; no stream, no text.
        return u""
    content = contents_ref.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    text = u""
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if operator == "Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == "T*":
            text += "\n"
        elif operator == "'":
            # The newline is added even when the string operand is skipped.
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == '"':
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == "TJ":
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i

        if text and not text.endswith(" "):
            text += " "  # Don't let words concatenate

    return text
Ejemplo n.º 7
0
 def InsertXObject(self, name):
     """Return the drawing commands for the XObject called *name*.

     An XObject can be an image or a 'form' (an arbitrary PDF sequence).
     The XObject is looked up in the ``/XObject`` resources of
     ``self.page``.

     * ``/Form``: the form's operations are wrapped in ``q`` ... ``Q``,
       preceded by a ``cm`` operation when the form has a ``/Matrix``, and
       passed to ``self.ProcessOperators`` together with the fonts from
       ``self.FetchFonts``.  The result is cached in ``self.formdrawings``
       under *name* and reused on later calls.
     * ``/Image``: the raw stream data, width, height and filters are
       passed to ``self.AddBitmap``.
     * Any other subtype contributes nothing.

     Returns:
         A new list of drawing commands; empty for unhandled subtypes.
     """
     dlist = []
     xobject = self.page["/Resources"].getObject()['/XObject']
     stream = xobject[name]
     subtype = stream.get('/Subtype')
     if subtype == '/Form':
         # insert contents into current page drawing
         if name not in self.formdrawings:       # extract if not already done
             pdf_fonts = self.FetchFonts(stream)
             matrix = stream.get('/Matrix')
             form_ops = ContentStream(stream, self.pdfdoc).operations
             oplist = [([], 'q')]                    # push state
             if matrix is not None:
                 # /Matrix is optional on a form XObject and defaults to
                 # the identity, so no 'cm' is needed when it is absent.
                 # NOTE(review): assumes ProcessOperators cannot take a
                 # None operand for 'cm' -- confirm.
                 oplist.append((matrix, 'cm'))       # apply matrix
             oplist.extend(form_ops)                 # add form contents
             oplist.append(([], 'Q'))                # restore original state
             self.formdrawings[name] = self.ProcessOperators(oplist, pdf_fonts)
         dlist.extend(self.formdrawings[name])
     elif subtype == '/Image':
         width = stream.get('/Width')
         height = stream.get('/Height')
         filters = stream.get("/Filter", ())
         dlist.append(self.AddBitmap(stream._data, width, height, filters))
     return dlist