コード例 #1
0
ファイル: test_parsers.py プロジェクト: yeus/pikepdf
def test_unparse_cs():
    """Unparse a ``q`` / ``cm`` / ``Q`` sequence and compare the stripped bytes."""
    save_state = ([], Operator('q'))
    identity_cm = (list(PdfMatrix.identity().shorthand), Operator('cm'))
    restore_state = ([], Operator('Q'))

    serialized = unparse_content_stream([save_state, identity_cm, restore_state])

    # Only the outer whitespace is stripped; the interior bytes must match as-is.
    assert serialized.strip() == b'q\n1 0 0 1 0 0 cm\n Q'
コード例 #2
0
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects whose render mode is 3.

    The page's content streams are coalesced, parsed, and replayed into a new
    stream. Instructions outside ``BT``..``ET`` are copied unchanged. A text
    object is buffered from ``BT`` through ``ET`` and copied only when the
    operand of the last ``Tr`` seen inside it is not 3; the tracked mode
    starts at 0 for every ``BT``. Instructions of a text object that never
    reaches ``ET`` are not copied.

    Args:
        pdf: Owner passed to ``Stream`` when creating the new content stream.
        page: Page object that is parsed and whose ``Contents`` is replaced.
    """
    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    rich_page = Page(page)
    rich_page.contents_coalesce()

    for operands, operator in parse_content_stream(page, ''):
        instruction = (operands, operator)

        if not inside_text:
            if operator == Operator('BT'):
                inside_text, mode = True, 0
                pending.append(instruction)
            else:
                kept.append(instruction)
            continue

        # Inside a text object: remember the render mode, buffer everything.
        if operator == Operator('Tr'):
            mode = operands[0]
        pending.append(instruction)

        if operator == Operator('ET'):
            inside_text = False
            if mode != 3:
                kept.extend(pending)
            pending.clear()

    page.Contents = Stream(pdf, unparse_content_stream(kept))
コード例 #3
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_inline_copy(inline):
    """Inline-image instructions can be copied and rebuilt from their image."""
    instructions = parse_content_stream(inline.pages[0].Contents)
    for candidate in instructions:
        if isinstance(candidate, ContentStreamInlineImage):
            # Copy-construct from the instruction itself; the result is unused.
            _duplicate = ContentStreamInlineImage(candidate)  # noqa: F841
            # Construct again from the wrapped inline image and unparse that.
            rebuilt = ContentStreamInlineImage(candidate.iimage)
            assert unparse_content_stream([rebuilt]).startswith(b'BI')
コード例 #4
0
ファイル: test_parsers.py プロジェクト: jeromerobert/pikepdf
def test_unparse_interpret_operator():
    """The operator may be given as ``Operator``, ``bytes`` or ``str``."""
    matrix = [2, 0, 0, 2, 0, 0]
    # Operator instance first, then bytes, then str.
    commands = [
        (matrix, Operator('cm')),
        (matrix, b'cm'),
        (matrix, 'cm'),
    ]

    result = unparse_content_stream(commands)

    assert result == b'2 0 0 2 0 0 cm\n2 0 0 2 0 0 cm\n2 0 0 2 0 0 cm'
コード例 #5
0
ファイル: test_parsers.py プロジェクト: yeus/pikepdf
def test_text_filter(resources, outdir):
    """Drop every ``Tj`` instruction from the first page and verify no text is left.

    The external ``pdftotext`` program is run on the input (which must yield
    text) and on the saved output (which must yield none).

    Args:
        resources: Directory-like path containing the input test PDF.
        outdir: Directory-like path where ``notext.pdf`` is written.
    """
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'

    # Ensure the test PDF has text we can find
    proc = run(
        ['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
    )
    assert proc.stdout.strip() != '', "Need input test file that contains text"

    # Use the context manager so the PDF is closed even when a step below raises.
    with Pdf.open(input_pdf) as pdf:
        page = pdf.pages[0]

        keep = []
        for operands, command in parse_content_stream(
            page, """TJ Tj ' " BT ET Td TD Tm T* Tc Tw Tz TL Tf Tr Ts"""
        ):
            if command == Operator('Tj'):
                print("skipping Tj")
                continue
            keep.append((operands, command))

        new_stream = Stream(pdf, pikepdf.unparse_content_stream(keep))
        print(new_stream.read_bytes())  # pylint: disable=no-member
        page['/Contents'] = new_stream
        page['/Rotate'] = 90

        # NOTE(review): second positional argument is presumably static_id - confirm.
        pdf.save(outdir / 'notext.pdf', True)

    proc = run(
        ['pdftotext', str(outdir / 'notext.pdf'), '-'],
        check=True,
        stdout=PIPE,
        encoding='utf-8',
    )

    assert proc.stdout.strip() == '', "Expected text to be removed"
コード例 #6
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_rejects_inline_image_missing(self):
     """An INLINE IMAGE operator paired with a plain string operand must fail."""
     bogus = ('should be a PdfInlineImage but is not', b'INLINE IMAGE')
     with pytest.raises(PdfParsingError):
         unparse_content_stream([bogus])
コード例 #7
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_rejects_not_operator(self):
     """A ``Name`` in the operator slot must be rejected while unparsing."""
     not_an_operator = Name.FortyTwo  # Name is not an operator
     instructions = [(['one', 'two'], not_an_operator)]
     with pytest.raises(PdfParsingError, match="While unparsing"):
         unparse_content_stream(instructions)
コード例 #8
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_rejects_not_castable_to_object(self):
     """An ``int`` in the operator slot must be rejected while unparsing."""
     not_an_operator = 42  # 42 is not an operator
     instructions = [(['one', 'two'], not_an_operator)]
     with pytest.raises(PdfParsingError, match="While unparsing"):
         unparse_content_stream(instructions)
コード例 #9
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_rejects_not_list_of_pairs(self):
     """A three-element entry is not an (operands, operator) pair and must fail."""
     triple = (1, 2, 3)
     with pytest.raises(PdfParsingError):
         unparse_content_stream([triple])
コード例 #10
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_unparse_invalid_inline_image():
    """An INLINE IMAGE operator whose operand is a plain int must fail."""
    bad_operands = (42, )
    bad_instruction = (bad_operands, Operator(b'INLINE IMAGE'))

    with pytest.raises(PdfParsingError):
        unparse_content_stream([bad_instruction])
コード例 #11
0
def synthesize_pdf(
    pdf_file,
    json_file,
    dst_dir,
    max_fonts,
    max_pages,
    num_outputs_per_document,
    synthesizer_class,
):
    """Write synthesized PDF/JSON pairs for one source document.

    The source PDF is run through the pdfminer interpreter to collect its
    fonts and extracted text, then each page's content stream is regenerated
    via ``parse_text`` once per missing output index and saved next to a new
    ground-truth JSON file.

    Args:
        pdf_file: Path of the source PDF.
        json_file: Path of the ground-truth JSON; its stem names the outputs.
        dst_dir: Directory receiving ``<stem>-<i>.pdf`` and ``<stem>-<i>.json``.
        max_fonts: Font-count limit; a falsy value disables the check.
        max_pages: Page-count limit; a falsy value disables the check.
        num_outputs_per_document: Number of output indices to consider.
        synthesizer_class: Callable invoked as
            ``synthesizer_class(ground_truth, font_map)``.

    Raises:
        AlreadyProcessed: Every output index already has both files.
        TooManyPagesException: The PDF has more than ``max_pages`` pages.
        TooManyFontsException: More than ``max_fonts`` fonts were collected.
        NoTextException: The extracted text is empty or whitespace only.
    """
    ground_truth = json.loads(json_file.read_text())
    pdf_io = BytesIO(pdf_file.read_bytes())
    output_string = StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr,
                           output_string,
                           codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter_fonts = {}

    def _out_path(_i, suffix):
        return dst_dir / f'{json_file.stem}-{_i}{suffix}'

    # An index still needs work unless both of its output files exist.
    k_to_process = [
        k for k in range(num_outputs_per_document)
        if not (_out_path(k, '.pdf').exists()
                and _out_path(k, '.json').exists())
    ]
    if not k_to_process:
        raise AlreadyProcessed(f'Already processed {pdf_file} {json_file}')

    with pikepdf.Pdf.open(pdf_file) as pdf:
        if max_pages and len(pdf.pages) > max_pages:
            raise TooManyPagesException(
                f'Too many pages {len(pdf.pages)} > {max_pages} in PDF, skipping!'
            )

        # Pairing with pdf.pages stops at the shorter of the two page sequences.
        for _, miner in zip(pdf.pages, PDFPage.get_pages(pdf_io)):
            interpreter.process_page(miner)
            interpreter_fonts.update(interpreter.fontmap)

    if max_fonts and len(interpreter_fonts) > max_fonts:
        raise TooManyFontsException(
            f'Too many fonts {len(interpreter_fonts)} > {max_fonts} in PDF, skipping!'
        )

    extracted = output_string.getvalue()
    if not re.sub(f'[{re.escape(string.whitespace)}]', '', extracted):
        raise NoTextException('PDF does not have any text! Skipping')

    font_map = {}
    for font_name, font in interpreter_fonts.items():
        font_map[f'/{font_name}'] = Font(f'/{font_name}', font)
    synthesizer = synthesizer_class(ground_truth, font_map)

    with pikepdf.Pdf.open(pdf_file) as pdf:
        new_contents = {}
        new_ground_truths = {}

        # First pass: build all replacement streams from the unmodified pages.
        for k in k_to_process:
            new_contents[k] = [
                pdf.make_stream(
                    pikepdf.unparse_content_stream(
                        parse_text(page, font_map, synthesizer)))
                for page in pdf.pages
            ]
            new_ground_truths[k] = synthesizer.create_new_ground_truth()
            synthesizer.reset()

        # Second pass: swap in each variant's streams and write its outputs.
        for k in k_to_process:
            for page, replacement in zip(pdf.pages, new_contents[k]):
                page.Contents = replacement

            pdf.save(_out_path(k, '.pdf'))
            serialized = json.dumps(new_ground_truths[k], indent=2)
            _out_path(k, '.json').write_text(serialized)
コード例 #12
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_build_instructions():
    """A hand-built instruction shows its operator in repr and unparses."""
    operands = [1, 0, 0, 1, 0, 0]
    instruction = ContentStreamInstruction(operands, Operator('cm'))

    text = repr(instruction)
    assert 'cm' in text

    assert unparse_content_stream([instruction]) == b'1 0 0 1 0 0 cm'
コード例 #13
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_unparse_failure():
    """A NaN operand must make unparsing raise ``PdfParsingError``."""
    nan_operands = [float('nan')]
    instructions = [(nan_operands, Operator('cm'))]

    with pytest.raises(PdfParsingError):
        unparse_content_stream(instructions)
コード例 #14
0
ファイル: test_parsers.py プロジェクト: yeus/pikepdf
def test_unparse_inline(resources):
    """Parsing then unparsing the first page keeps the ``BI`` marker."""
    source = resources / 'image-mono-inline.pdf'
    with Pdf.open(source) as pdf:
        first_page = pdf.pages[0]
        roundtrip = unparse_content_stream(parse_content_stream(first_page))
        assert b'BI' in roundtrip
コード例 #15
0
    def filter_content(self, content, layer=None):
        """Return a rebuilt content stream with layer filtering/modification applied.

        The content stream of ``content`` is parsed and replayed instruction by
        instruction. An instruction is copied when copying is currently enabled
        or when its operator is not one of the "show" operators listed in
        ``pdf_ops.ops``. Inside an optional-content block with active layer
        modifications, operands of matching operators are replaced and any
        modification not yet applied is emitted before a stroking operator.

        Args:
            content: A page or an xobject; its ``Resources`` (if present) are
                passed to ``self.find_page_keep``.
            layer: Optional key into ``self.line_props``. When given, the whole
                stream is treated as being inside optional content, copying is
                enabled from the start, and ``BDC`` detection is skipped.

        Returns:
            The result of ``pikepdf.unparse_content_stream`` on the new
            instruction list.
        """
        # content can be either a page or an xobject
        if '/Resources' in content.keys():
            page_keep = self.find_page_keep(content.Resources)
        else:
            page_keep = {}

        commands = pikepdf.parse_content_stream(content)
        # Operators whose pdf_ops entry is tagged 'show'
        show_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show'
        ]
        # Subset of the show operators additionally tagged 'stroke'
        stroke_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show' and v[1] == 'stroke'
        ]
        new_content = []
        in_oc = False
        currently_copying = self.keep_non_oc
        gs_mod = []
        # NOTE(review): new_q is never set to True in this method, so the
        # branch that appends an extra 'q' below does not run - confirm intent.
        new_q = False

        if layer is not None:
            layer_mod, mod_applied = self.convert_layer_props(
                self.line_props[layer])
            in_oc = True
            currently_copying = True
        else:
            layer_mod = None
            mod_applied = None

        for operands, operator in commands:
            # check to see if this pdf has CMYK or RGB colour definitions
            if not self.colour_type:
                self.check_colour(operator, operands)

            # look for optional content
            if layer is None and operator == pikepdf.Operator('BDC'):
                # BDC/BMC doesn't necessarily mean optional content block
                # check the operands for the /OC flag
                if len(operands) > 1 and operands[0] == '/OC':
                    in_oc = True
                    if operands[1] in page_keep.keys():
                        currently_copying = True

                        # get a link to the current line property modifications requested
                        if page_keep[operands[1]] in self.line_props.keys():
                            layer_mod, mod_applied = self.convert_layer_props(
                                self.line_props[page_keep[operands[1]]])
                    else:
                        currently_copying = False

            # all kinds of crazy stuff going on behind the scenes, so to select layers we can't just delete everything.
            # Just copy the non-showing operations
            if currently_copying or operator not in show_ops:
                new_command = [operands, operator]

                if in_oc and layer_mod is not None:
                    op_string = str(operator)

                    # if we need to modify graphics state dictionaries, we need to retrieve that from the resources
                    # NOTE(review): the membership test uses str(operands) while
                    # the list stores operands themselves, so this looks like it
                    # never de-duplicates - confirm. gs_mod is only used for the
                    # warning printed after the loop.
                    if op_string == 'gs' and str(operands) not in gs_mod:
                        gs_mod.append(operands)

                    # check for one of the line property modification operators
                    if op_string in layer_mod.keys():
                        new_command[0] = layer_mod[op_string]
                        mod_applied[op_string] = True

                    # check if we're drawing but haven't applied all mods yet
                    if operator in stroke_ops and not all(
                            mod_applied.values()):
                        needs_mod = [
                            k for k, v in mod_applied.items() if not v
                        ]
                        # These are appended before new_command so they precede
                        # the stroking operator in the output.
                        for key in needs_mod:
                            new_content.append(
                                [layer_mod[key],
                                 pikepdf.Operator(key)])
                            mod_applied[key] = True

                    if op_string == 'Q':
                        # reset the dictionary if we're in a new q/Q block
                        if all(mod_applied.values()):
                            mod_applied = {
                                key: False
                                for key in mod_applied.keys()
                            }

                new_content.append(new_command)

                # q is the only command that needs to go after the current command
                if new_q:
                    new_content.append([[], pikepdf.Operator('q')])
                    new_q = False

            # EMC closes the block: restore the default copy policy and drop
            # the active layer modifications.
            if in_oc and operator == pikepdf.Operator('EMC'):
                currently_copying = self.keep_non_oc
                in_oc = False
                layer_mod = None

        if len(gs_mod) > 0:
            print(
                'Found graphics state dictionary, layer modification may not work as expected'
            )

        return pikepdf.unparse_content_stream(new_content)
コード例 #16
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_accepts_all_lists(self):
     """Instructions expressed purely as nested lists are accepted."""
     instruction = [[], b'Q']
     unparse_content_stream([instruction])
コード例 #17
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
 def test_accepts_all_tuples(self):
     """Instructions expressed purely as nested tuples are accepted."""
     instruction = ((Name.Foo, ), b'/Do')
     unparse_content_stream((instruction, ))
コード例 #18
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_unparse_inline(inline):
    """The unparsed first page contains ``BI`` and matches the slow unparser."""
    instructions = parse_content_stream(inline.pages[0])

    fast = unparse_content_stream(instructions)
    assert b'BI' in fast

    # Compare against the reference implementation on the same instructions.
    assert fast == slow_unparse_content_stream(instructions)