def test_objectlist_repr(pal):
    """The repr of an operand list spells out each operand's own repr."""
    instructions = pikepdf.parse_content_stream(pal.pages[0].Contents)
    # Second instruction of the stream; element 0 of an instruction is
    # its operand list.
    operand_list = instructions[1][0]
    assert isinstance(operand_list, pikepdf._qpdf._ObjectList)

    expected = "[Decimal('144.0000'), 0, 0, Decimal('144.0000'), Decimal('0.0000'), Decimal('0.0000')]"
    assert expected in repr(operand_list)
def test_text_filter(resources, outdir):
    """Dropping every ``Tj`` instruction leaves nothing for pdftotext to find."""

    def extract_text(path):
        # '-' makes pdftotext write the extracted text to stdout.
        result = run(
            ['pdftotext', str(path), '-'],
            check=True,
            stdout=PIPE,
            encoding='utf-8',
        )
        return result.stdout.strip()

    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
    output_pdf = outdir / 'notext.pdf'

    # Ensure the test PDF has text we can find, otherwise the final
    # assertion would pass trivially.
    assert extract_text(input_pdf) != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]

    show_text = Operator('Tj')
    keep = []
    for operands, command in parse_content_stream(page):
        if command == show_text:
            print("skipping Tj")
        else:
            keep.append((operands, command))

    new_stream = Stream(pdf, keep)
    print(new_stream.read_bytes())  # pylint: disable=no-member
    page['/Contents'] = new_stream
    page['/Rotate'] = 90
    pdf.save(output_pdf, True)

    assert extract_text(output_pdf) == '', "Expected text to be removed"
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    Instructions outside ``BT``/``ET`` are copied through unchanged.
    Instructions inside a text object are buffered until ``ET``; the buffer
    is kept only when the last ``Tr`` seen in that object did not select
    mode 3. The render mode is reset to 0 at every ``BT``.
    """
    begin_text = Operator('BT')
    end_text = Operator('ET')
    set_render_mode = Operator('Tr')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    render_mode = 0

    Page(page).contents_coalesce()
    for operands, operator in parse_content_stream(page, ''):
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending.append((operands, operator))
            if operator == end_text:
                inside_text = False
                if render_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    page.Contents = Stream(pdf, unparse_content_stream(kept))
def test_inline_copy(inline):
    """Inline image instructions can be copied and rebuilt from their image."""
    instructions = parse_content_stream(inline.pages[0].Contents)
    inline_images = (
        instr for instr in instructions
        if isinstance(instr, ContentStreamInlineImage)
    )
    for csiimage in inline_images:
        # Copy construction only has to succeed; the result is not inspected.
        _copy_of_csiimage = ContentStreamInlineImage(csiimage)  # noqa: F841
        rebuilt = ContentStreamInlineImage(csiimage.iimage)
        assert unparse_content_stream([rebuilt]).startswith(b'BI')
def test_inline(inline):
    """An inline image survives an unparse / reparse round trip."""
    iimage, pdf = inline

    assert iimage.width == 8
    assert iimage.image_mask == False
    assert iimage.mode == 'RGB'
    assert iimage.is_inline
    assert iimage.colorspace == '/DeviceRGB'

    # Wrap the unparsed image in a fresh stream and parse it back.
    cs = pdf.make_stream(iimage.unparse())
    for operands, _command in parse_content_stream(cs):
        if not operands:
            continue
        candidate = operands[0]
        if isinstance(candidate, PdfInlineImage):
            assert candidate == iimage
def test_invalid_stream_object():
    """parse_content_stream rejects inputs that are not a page or stream."""
    # A plain Python int
    with pytest.raises(TypeError):
        parse_content_stream(42)

    # A dictionary that is not a page
    with pytest.raises(TypeError):
        not_a_page = Dictionary({"/Hi": 3})
        parse_content_stream(not_a_page)

    # A page-typed dictionary whose Contents is not a stream
    with pytest.raises(PdfError):
        false_page = Dictionary(Type=Name.Page, Contents=42)
        parse_content_stream(false_page)
def _simple_interpret_content_stream(page: Union[Page, Object]):
    """Yield ``(xobject_name, ctm)`` for every ``Do`` in the content stream.

    Only ``q``, ``Q``, ``cm`` and ``Do`` are requested from the parser. The
    current transformation matrix starts as the identity, is saved and
    restored by ``q``/``Q``, and is pre-multiplied by each ``cm`` matrix.
    Inline images are skipped.
    """
    save_state, restore_state = Operator('q'), Operator('Q')
    concat_matrix, draw_xobject = Operator('cm'), Operator('Do')

    ctm = PdfMatrix.identity()
    saved: List[PdfMatrix] = []
    for instruction in parse_content_stream(page, operators='q Q cm Do'):
        if isinstance(instruction, ContentStreamInlineImage):
            continue
        operands = instruction.operands
        operator = instruction.operator
        if operator == save_state:
            saved.append(ctm)
        elif operator == restore_state:
            ctm = saved.pop()
        elif operator == concat_matrix:
            ctm = PdfMatrix(operands) @ ctm
        elif operator == draw_xobject:
            yield (operands[0], ctm)
def test_parse_results(inline):
    """Parsed instructions expose typed operands, operator and repr."""
    cmds = parse_content_stream(inline.pages[0])

    first = cmds[0]
    assert isinstance(first, ContentStreamInstruction)
    assert isinstance(first.operands, _qpdf._ObjectList)
    assert isinstance(first.operator, Operator)
    assert 'Operator' in repr(first)
    # Copy construction keeps the operator.
    assert ContentStreamInstruction(first).operator == first.operator

    for cmd in cmds:
        if not isinstance(cmd, ContentStreamInlineImage):
            continue
        assert cmd.operator == Operator("INLINE IMAGE")
        assert isinstance(cmd.operands[0], PdfInlineImage)
        assert 'INLINE' in repr(cmd)
        assert cmd.operands[0] == cmd.iimage
def test_invalid_stream_object():
    """Each kind of invalid input raises TypeError with a specific message."""
    # A plain Python int
    with pytest.raises(TypeError, match="must be a pikepdf.Object"):
        parse_content_stream(42)

    # A dictionary that is not a page
    with pytest.raises(TypeError, match="called on page or stream"):
        not_a_page = Dictionary({"/Hi": 3})
        parse_content_stream(not_a_page)

    # A page-typed dictionary whose Contents is not a stream
    with pytest.raises(
        TypeError, match="parse_content_stream called on non-stream Object"
    ):
        false_page = Dictionary(Type=Name.Page, Contents=42)
        parse_content_stream(false_page)
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    Instructions outside ``BT``/``ET`` are copied through unchanged.
    Instructions inside a text object are buffered until ``ET``; the buffer
    is kept only when the last ``Tr`` seen in that object did not select
    mode 3. The surviving instructions are serialized by hand, one per line.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    render_mode = 0

    pikepdf.Page(page).contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending.append((operands, operator))
            if operator == end_text:
                inside_text = False
                if render_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    def to_bytes(operand):
        # Operands without an unparse() method fall back to their str() form.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    lines = []
    for operands, operator in kept:
        if operator == inline_image:
            # The inline image object serializes itself.
            lines.append(operands[0].unparse())
            continue
        serialized = b' '.join(to_bytes(operand) for operand in operands)
        lines.append(serialized + b' ' + operator.unparse())

    page.Contents = pikepdf.Stream(pdf, b'\n'.join(lines))
def test_inline(inline):
    """An inline image unparses with abbreviated keys and round-trips."""
    iimage, pdf = inline

    assert iimage.width == 8
    assert iimage.image_mask == False
    assert iimage.mode == 'RGB'
    assert iimage.is_inline
    assert iimage.colorspace == '/DeviceRGB'
    assert 'PdfInlineImage' in repr(iimage)

    unparsed = iimage.unparse()
    assert b'/W 8' in unparsed, "inline images should have abbreviated metadata"
    assert b'/Width 8' not in unparsed, "abbreviations expanded in inline image"

    # Wrap the unparsed image in a fresh stream and parse it back.
    cs = pdf.make_stream(unparsed)
    for operands, _command in parse_content_stream(cs):
        if not operands:
            continue
        candidate = operands[0]
        if isinstance(candidate, PdfInlineImage):
            assert candidate == iimage
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    Instructions outside ``BT``/``ET`` are copied through unchanged.
    Instructions inside a text object are buffered until ``ET``; the buffer
    is kept only when the last ``Tr`` seen in that object did not select
    mode 3. The surviving instructions are serialized by hand, one per line.
    """
    begin_text = pikepdf.Operator("BT")
    end_text = pikepdf.Operator("ET")
    set_render_mode = pikepdf.Operator("Tr")
    inline_image = pikepdf.Operator("INLINE IMAGE")

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    render_mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending.append((operands, operator))
            if operator == end_text:
                inside_text = False
                if render_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    def to_bytes(operand):
        # Operands without an unparse() method fall back to their str() form.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode("ascii")

    lines = []
    for operands, operator in kept:
        if operator == inline_image:
            # The inline image object serializes itself.
            lines.append(operands[0].unparse())
            continue
        serialized = b" ".join(to_bytes(operand) for operand in operands)
        lines.append(serialized + b" " + operator.unparse())

    page.Contents = pikepdf.Stream(pdf, b"\n".join(lines))
def strip_invisible_text(pdf, page, log):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    Instructions outside ``BT``/``ET`` are copied through unchanged.
    Instructions inside a text object are buffered until ``ET``; the buffer
    is kept only when the last ``Tr`` seen in that object did not select
    mode 3. The surviving instructions are serialized by hand, one per line.

    ``log`` is accepted but not used by this function.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    render_mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending.append((operands, operator))
            if operator == end_text:
                inside_text = False
                if render_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    def to_bytes(operand):
        # Operands without an unparse() method fall back to their str() form.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    lines = []
    for operands, operator in kept:
        if operator == inline_image:
            # The inline image object serializes itself.
            lines.append(operands[0].unparse())
            continue
        serialized = b' '.join(to_bytes(operand) for operand in operands)
        lines.append(serialized + b' ' + operator.unparse())

    page.Contents = pikepdf.Stream(pdf, b'\n'.join(lines))
def parse_text(qpdf_page: pikepdf.Page, font_map, synthesizer: PdfSynthesizer):
    """Return the page's content stream with every text string synthesized.

    Instructions are copied through unchanged except for text objects: on
    ``BT`` the shared instruction iterator is handed to ``_parse_text_block``
    (presumably it consumes instructions up to the matching ``ET`` -- confirm
    against that helper), each text string in the returned block is replaced
    with ``synthesizer.modify_text(...)``, and the block's own content stream
    is appended to the result.

    Args:
        qpdf_page: Page whose content stream is parsed.
        font_map: Font lookup passed to ``_parse_font`` and
            ``_parse_text_block``.
        synthesizer: Provides ``modify_text(text, font=...)``.

    Returns:
        A list of content stream instructions for the rewritten page.

    Raises:
        HasFormException: if ``has_form`` reports a form for a ``Do``
            instruction.
        TooManySingleChars: if more than 90% of the text strings on the page
            are a single character long.
    """
    draw_xobject = pikepdf.Operator('Do')
    select_font = pikepdf.Operator('Tf')
    begin_text = pikepdf.Operator('BT')

    # An explicit iterator so _parse_text_block can advance the same stream.
    content_stream = iter(pikepdf.parse_content_stream(qpdf_page))
    new_content_stream = []
    last_used_font = None
    text_lengths = collections.Counter()

    for operands, operator in content_stream:
        if operator == draw_xobject and has_form(qpdf_page, operands):
            raise HasFormException
        if operator == select_font:
            last_used_font = _parse_font(operands, font_map)
        if operator == begin_text:
            text_block, last_used_font = _parse_text_block(
                font_map=font_map,
                start=(operands, operator),
                content_stream=content_stream,
                current_font=last_used_font,
            )
            for text_id, text, font in text_block:
                text_lengths[len(text)] += 1
                modified_text = synthesizer.modify_text(text, font=font)
                text_block.set_unicode_text(text_id, modified_text)
            new_content_stream.extend(text_block.content_stream)
        else:
            new_content_stream.append((operands, operator))

    # A page without any text strings has no ratio to compute; dividing by
    # the empty total would raise ZeroDivisionError.
    total_strings = sum(text_lengths.values())
    if total_strings:
        single_chars = text_lengths[1] / total_strings
        if single_chars > 0.9:
            raise TooManySingleChars(
                f'Too many single characters in document ({single_chars * 100:.2f}%)'
            )
    return new_content_stream
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a content stream and record where images are drawn.

    Of the PDF graphics state only the current transformation matrix (CTM)
    is tracked; a full interpreter would need to track much more. The CTM
    starts at ``initial_shorthand``. PDF units are 1/72", which suits our
    needs, so no device-space scaling is applied.

    Images always occupy (0, 0) -> (1, 1) in their own coordinate system, so
    a ``cm`` before the image is expected to map that unit square onto the
    desired area of the page. The CTM in effect at each ``Do`` or inline
    image is therefore recorded along with the graphics stack depth.

    Raises:
        RuntimeError: if ``q`` pushes the graphics stack deeper than 32, or
            ``Q`` is seen with an empty stack.
    """
    saved_ctms = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_text = False

    text_operators = {'Tj', 'TJ', '"', "'"}
    operator_whitelist = """q Q Do cm TJ Tj " ' BI ID EI"""

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, command) in enumerate(instructions):
        if command == 'q':
            saved_ctms.append(ctm)
            if len(saved_ctms) > 32:
                raise RuntimeError("PDF graphics stack overflow, command %i" % n)
        elif command == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                raise RuntimeError("PDF graphics stack underflow, command %i" % n)
        elif command == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif command == 'Do':
            xobject_settings.append(
                XobjectSettings(
                    name=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command == 'INLINE IMAGE':
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command in text_operators:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_text=found_text,
    )
def test_unparse_inline(inline):
    """Unparsing keeps the inline image and matches the slow reference."""
    cmds = parse_content_stream(inline.pages[0])
    unparsed = unparse_content_stream(cmds)

    assert b'BI' in unparsed
    # The slow reference implementation must produce identical bytes.
    assert unparsed == slow_unparse_content_stream(cmds)
def test_parse_xobject(resources):
    """A form XObject can be parsed directly; its first operator is ``cm``."""
    with Pdf.open(resources / 'formxobject.pdf') as pdf:
        xobjects = pdf.pages[0].Resources.XObject
        instructions = parse_content_stream(xobjects.Form1)
        # Element 1 of an instruction is its operator.
        first_operator = instructions[0][1]
        assert first_operator == Operator('cm')
def filter_content(self, content, layer=None):
    """Return an unparsed content stream filtered by optional-content layer.

    Walks the parsed instructions of ``content`` and copies them into a new
    instruction list. "Show" operators (as classified by ``pdf_ops.ops``)
    are dropped while ``currently_copying`` is false; all other operators
    are always copied. Inside a kept optional-content block, operands of
    line-property operators are replaced with the values from
    ``self.convert_layer_props``, and any properties not yet applied are
    injected just before the first stroking operator.

    Args:
        content: a page or an XObject; its ``/Resources`` (if present) are
            passed to ``self.find_page_keep`` to learn which optional-content
            names to keep.
        layer: when given, the whole stream is treated as belonging to this
            layer (``BDC`` markers are not inspected) and
            ``self.line_props[layer]`` supplies the modifications.

    Returns:
        The result of ``pikepdf.unparse_content_stream`` on the filtered
        instruction list.
    """
    # content can be either a page or an xobject
    if '/Resources' in content.keys():
        page_keep = self.find_page_keep(content.Resources)
    else:
        page_keep = {}
    commands = pikepdf.parse_content_stream(content)
    # Operators whose pdf_ops entry is tagged 'show'; stroke_ops is the
    # subset additionally tagged 'stroke'.
    show_ops = [
        pikepdf.Operator(k) for k, v in pdf_ops.ops.items() if v[0] == 'show'
    ]
    stroke_ops = [
        pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
        if v[0] == 'show' and v[1] == 'stroke'
    ]
    new_content = []
    in_oc = False
    currently_copying = self.keep_non_oc
    gs_mod = []
    # NOTE(review): new_q is never set to True anywhere in this block, so the
    # 'q' insertion further down looks unreachable -- confirm.
    new_q = False
    if layer is not None:
        # An explicit layer means the whole stream is handled as one kept
        # optional-content block.
        layer_mod, mod_applied = self.convert_layer_props(
            self.line_props[layer])
        in_oc = True
        currently_copying = True
    else:
        layer_mod = None
        mod_applied = None
    for operands, operator in commands:
        # check to see if this pdf has CMYK or RGB colour definitions
        if not self.colour_type:
            self.check_colour(operator, operands)
        # look for optional content
        if layer is None and operator == pikepdf.Operator('BDC'):
            # BDC/BMC doesn't necessarily mean optional content block
            # check the operands for the /OC flag
            if len(operands) > 1 and operands[0] == '/OC':
                in_oc = True
                if operands[1] in page_keep.keys():
                    currently_copying = True
                    # get a link to the current line property modifications requested
                    if page_keep[operands[1]] in self.line_props.keys():
                        layer_mod, mod_applied = self.convert_layer_props(
                            self.line_props[page_keep[operands[1]]])
                else:
                    currently_copying = False
        # all kinds of crazy stuff going on behind the scenes, so to select layers we can't just delete everything.
        # Just copy the non-showing operations
        if currently_copying or operator not in show_ops:
            new_command = [operands, operator]
            if in_oc and layer_mod is not None:
                op_string = str(operator)
                # if we need to modify graphics state dictionaries, we need to retrieve that from the resources
                # NOTE(review): gs_mod stores operand lists but the membership
                # test uses str(operands), so it may never match -- confirm.
                if op_string == 'gs' and str(operands) not in gs_mod:
                    gs_mod.append(operands)
                # check for one of the line property modification operators
                if op_string in layer_mod.keys():
                    new_command[0] = layer_mod[op_string]
                    mod_applied[op_string] = True
                # check if we're drawing but haven't applied all mods yet
                if operator in stroke_ops and not all(
                        mod_applied.values()):
                    needs_mod = [
                        k for k, v in mod_applied.items() if not v
                    ]
                    for key in needs_mod:
                        new_content.append(
                            [layer_mod[key], pikepdf.Operator(key)])
                        mod_applied[key] = True
                if op_string == 'Q':
                    # reset the dictionary if we're in a new q/Q block
                    if all(mod_applied.values()):
                        mod_applied = {
                            key: False for key in mod_applied.keys()
                        }
            new_content.append(new_command)
            # q is the only command that needs to go after the current command
            if new_q:
                new_content.append([[], pikepdf.Operator('q')])
                new_q = False
        # EMC closes the optional-content block: fall back to the default
        # copying policy and drop the layer modifications.
        if in_oc and operator == pikepdf.Operator('EMC'):
            currently_copying = self.keep_non_oc
            in_oc = False
            layer_mod = None
    if len(gs_mod) > 0:
        print(
            'Found graphics state dictionary, layer modification may not work as expected'
        )
    return pikepdf.unparse_content_stream(new_content)
def test_unparse_inline(resources):
    """Unparsing a page with an inline image keeps the ``BI`` operator."""
    with Pdf.open(resources / 'image-mono-inline.pdf') as pdf:
        first_page = pdf.pages[0]
        instructions = parse_content_stream(first_page)
        assert b'BI' in unparse_content_stream(instructions)
def test_invalid_stream_object():
    """A dictionary that is neither page nor stream raises TypeError."""
    with pytest.raises(TypeError):
        not_a_page = Dictionary({"/Hi": 3})
        parse_content_stream(not_a_page)
def inline(resources):
    """Return ``(inline_image, pdf)`` for the first inline image on page 0.

    The Pdf is returned alongside the image and is left open. Returns
    ``None`` if the page has no inline image.
    """
    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
    for operands, _command in parse_content_stream(pdf.pages[0]):
        if not operands:
            continue
        first_operand = operands[0]
        if isinstance(first_operand, PdfInlineImage):
            return first_operand, pdf
    return None
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a content stream and record images, vector drawing and text.

    Of the PDF graphics state only the current transformation matrix (CTM)
    is tracked; a full interpreter would need to track much more. The CTM
    starts at ``initial_shorthand``. PDF units are 1/72", which suits our
    needs, so no device-space scaling is applied.

    Images always occupy (0, 0) -> (1, 1) in their own coordinate system, so
    a ``cm`` before the image is expected to map that unit square onto the
    desired area of the page. The CTM in effect at each ``Do`` or inline
    image is therefore recorded along with the graphics stack depth.

    The PDF specification caps the graphics stack at 32 entries, but other
    viewers tolerate more. Exceeding 32 only produces a warning; a hard
    limit of 128 bounds memory use. Stack underflow is undefined by the
    spec; it is reported with a warning and the CTM is left unchanged.

    Raises:
        RuntimeError: if the graphics stack grows beyond 128 entries.
    """
    saved_ctms = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    name_index = defaultdict(lambda: [])
    found_vector = False
    found_text = False

    vector_ops = set('S s f F f* B B* b b*'.split())
    text_showing_ops = set("""TJ Tj " '""".split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, operator) in enumerate(instructions):
        if operator == 'q':
            saved_ctms.append(ctm)
            depth = len(saved_ctms)
            if depth > 128:
                # Hard limit, see docstring
                raise RuntimeError(
                    "PDF graphics stack overflowed hard limit, operator %i" % n
                )
            if depth > 32:
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                # Nothing to restore: carry on with the CTM as it is.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            image_name = operands[0]
            settings = XobjectSettings(
                name=image_name,
                shorthand=ctm.shorthand,
                stack_depth=len(saved_ctms),
            )
            xobject_settings.append(settings)
            name_index[image_name].append(settings)
        elif operator == 'INLINE IMAGE':
            # BI/ID/EI arrive grouped as this single pseudo-operator
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a content stream and record images and vector drawing.

    Of the PDF graphics state only the current transformation matrix (CTM)
    is tracked; a full interpreter would need to track much more. The CTM
    starts at ``initial_shorthand``. PDF units are 1/72", which suits our
    needs, so no device-space scaling is applied.

    Images always occupy (0, 0) -> (1, 1) in their own coordinate system, so
    a ``cm`` before the image is expected to map that unit square onto the
    desired area of the page. The CTM in effect at each ``Do`` or inline
    image is therefore recorded along with the graphics stack depth.

    The PDF specification caps the graphics stack at 32 entries, but other
    viewers tolerate more. Exceeding 32 only produces a warning; a hard
    limit of 128 bounds memory use. Stack underflow is undefined by the
    spec; it is reported with a warning and the CTM is left unchanged.

    Raises:
        RuntimeError: if the graphics stack grows beyond 128 entries.
    """
    saved_ctms = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_vector = False

    vector_ops = set('S s f F f* B B* b b*'.split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | image_ops)

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, operator) in enumerate(instructions):
        if operator == 'q':
            saved_ctms.append(ctm)
            depth = len(saved_ctms)
            if depth > 128:
                # Hard limit, see docstring
                raise RuntimeError(
                    "PDF graphics stack overflowed hard limit, operator %i" % n
                )
            if depth > 32:
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                # Nothing to restore: carry on with the CTM as it is.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            xobject_settings.append(
                XobjectSettings(
                    name=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator == 'INLINE IMAGE':
            # BI/ID/EI arrive grouped as this single pseudo-operator
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator in vector_ops:
            found_vector = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
    )