def test_parser_exception(resources):
    """Check that parsing a content stream with ``ExceptionParser`` raises ValueError.

    The PDF is opened as a context manager so it is closed even when the
    assertion fails.
    """
    with Pdf.open(resources / 'graph.pdf') as pdf:
        stream = pdf.pages[0]['/Contents']
        with pytest.raises(ValueError):
            Object._parse_stream(stream, ExceptionParser())
def outlines(resources):
    """Open ``outlines.pdf`` from the resources directory and return it."""
    pdf_path = resources / 'outlines.pdf'
    return Pdf.open(pdf_path)
def test_has_text(resources, test_file, expected):
    """Check that ``Page.has_text()`` equals ``expected`` for every page of ``test_file``.

    The PDF is opened as a context manager so it is closed once all pages
    have been checked, or as soon as an assertion fails.
    """
    with Pdf.open(resources / test_file) as pdf:
        for p in pdf.pages:
            page = Page(p)
            assert page.has_text() == expected
def test_open_pdf_password(resources):
    """Open the encrypted graph PDF with a password and check its page count is 1.

    The PDF is opened as a context manager so it is closed after the check.
    """
    with Pdf.open(resources / 'graph-encrypted.pdf', password='******') as pdf:
        assert pdf.root['/Pages']['/Count'] == 1
def sandwich(resources):
    """Yield ``sandwich.pdf`` opened from the resources directory.

    The file is closed when the generator is resumed after the yield.
    """
    # Original author's note on this file:
    # Has XMP, docinfo, <?adobe-xap-filters esc="CRLF"?>, shorthand attribute XMP
    pdf_path = resources / 'sandwich.pdf'
    with Pdf.open(pdf_path) as document:
        yield document
def test_open_pdf_password_encoding(self, resources):
    """Opening the encrypted graph PDF with a two-byte bytes password must raise PasswordError."""
    encrypted_path = resources / 'graph-encrypted.pdf'
    with pytest.raises(PasswordError):
        Pdf.open(encrypted_path, password=b'\x01\xfe')
def test_stream(self, resources):
    """Open a PDF from a binary file stream and check that it reports one page.

    Both the stream and the PDF built on it are managed by the ``with``
    statement; the PDF is closed first, then the underlying stream.
    """
    with (resources / 'pal-1bit-trivial.pdf').open('rb') as stream, Pdf.open(stream) as pdf:
        assert pdf.root.Pages.Count == 1
def test_file_descriptor(resources):
    """Passing a raw integer file descriptor to ``Pdf.open`` must raise TypeError."""
    pdf_path = resources / 'pal-1bit-trivial.pdf'
    with pdf_path.open('rb') as handle, pytest.raises(TypeError):
        Pdf.open(handle.fileno())
def test_some_permissions_missing(self, resources):
    """Check two permission flags of the encrypted graph PDF opened with 'owner'.

    Asserts that ``allow.print_highres`` equals ``allow.modify_annotation``
    and that the latter equals ``False``. The PDF is opened as a context
    manager so it is closed after the check.
    """
    with Pdf.open(resources / 'graph-encrypted.pdf', 'owner') as pdf:
        # Chained comparison kept as written: a == b and b == False.
        assert pdf.allow.print_highres == pdf.allow.modify_annotation == False
def test_attr_access(resources):
    """Check that attribute-style access reaches the page count of ``graph.pdf``."""
    graph_path = resources / 'graph.pdf'
    with Pdf.open(graph_path) as document:
        page_count = int(document.Root.Pages.Count)
        assert page_count == 1
def test_read_not_readable_file(self, outdir):
    """Passing a write-only file object to ``Pdf.open`` must raise ValueError.

    The error message is expected to match ``not readable``. The file is
    opened as a context manager so its handle is closed when the test ends.
    """
    with (Path(outdir) / 'writeme.pdf').open('wb') as writable:
        with pytest.raises(ValueError, match=r'not readable'):
            Pdf.open(writable)
def compile(
    exam,
    json,
    md,
    seed,
    subtitle,
    with_solutions,
    exam_type,
    semester,
    json_out,
    merged_md,
    draft,
    out,
):
    """
    Compile one PDF or JSON (from Markdown), unencrypted.
    The exam may be deployed or local (in Markdown or JSON).
    If a seed is specified, it will scramble the exam.
    """
    # NOTE: the docstring above is kept verbatim; it may be surfaced as CLI
    # help text by a decorator that is not visible here.
    #
    # Flow:
    #   1. Load the exam from `json`, convert it from `md`, or fetch it by name.
    #   2. Optionally scramble it with `seed`.
    #   3. Strip solutions unless scrambling or `with_solutions` is set.
    #   4. Either dump JSON to `json_out` and return, or render and save a PDF
    #      to `<out>/<exam>.pdf`.

    # An empty/None output directory is passed to pathlib as "".
    if not out:
        out = ""

    pathlib.Path(out).mkdir(parents=True, exist_ok=True)

    if json:
        print("Loading exam...")
        exam_data = load(json)
    elif md:
        exam_text_data = md.read()
        if merged_md:
            # Only resolve imports and write the merged Markdown; no
            # compilation happens in this mode.
            buff = LineBuffer(exam_text_data)
            handle_imports(buff, path=os.path.dirname(md.name))
            merged_md.write("\n".join(buff.lines))
            return
        print("Compiling exam...")
        exam_data = convert(exam_text_data, path=os.path.dirname(md.name), draft=draft)
    else:
        print("Fetching exam...")
        exam_data = get_exam(exam=exam)

    if seed:
        print("Scrambling exam...")
        exam_data = scramble(seed, exam_data, keep_data=with_solutions)

    def remove_solutions_from_groups(groups):
        """Recursively drop the "solution" key from each group and nested group."""
        for group in groups:
            group.pop("solution", None)
            if group.get("type") == "group":
                remove_solutions_from_groups(group.get("elements", []))

    if not seed and not with_solutions:
        print("Removing solutions...")
        groups = exam_data.get("groups", [])
        remove_solutions_from_groups(groups)

    if json_out:
        print("Dumping json...")
        dump(exam_data, json_out, indent=4, sort_keys=True)
        return

    print("Rendering exam...")
    settings = {
        "coursecode": prettify(exam.split("-")[0]),
        "description": subtitle,
        "examtype": exam_type,
        "semester": semester,
    }
    if seed:
        settings["emailaddress"] = sanitize_email(seed)

    # Use distinct names for the rendered bytes and the opened document, and
    # let the context manager close the document even if save() raises.
    with render_latex(exam_data, settings) as rendered:
        with Pdf.open(BytesIO(rendered)) as pdf:
            pdf.save(os.path.join(out, exam + ".pdf"))
def test_pypdf2_issue_361(private):
    """Opening the gzip-compressed sample must raise PdfError matching ``trailer``."""
    archive_path = str(private / 'pypdf2_issue_361.pdf.gz')
    with gzip.open(archive_path, 'rb') as unpacked, pytest.raises(PdfError, match=r'trailer'):
        Pdf.open(unpacked)
import re

import numpy as np
import pandas as pd
import PyPDF2
import textract
from pikepdf import Pdf

filename = 'cba_2020_annual_report.pdf'

# Open in binary mode. The handle stays open because the reader is used to
# fetch pages further down.
pdfFileObj = open(filename, 'rb')
# The pdfReader variable is a readable object that will be parsed.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

if pdfReader.isEncrypted:
    # Re-save the document through pikepdf under a new name and read that
    # copy instead (presumably the saved copy is unencrypted -- confirm
    # against pikepdf's save() defaults).
    with Pdf.open(filename, password="******") as pdf:
        pdf.save("new" + filename)
    filename = "new" + filename
    # Release the handle on the original file before rebinding the name;
    # otherwise it is leaked.
    pdfFileObj.close()
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

# NOTE: the page window is hard-coded (0-based pages 55..57) rather than
# derived from pdfReader.numPages, which is only printed for reference.
num_pages = 58
print(pdfReader.numPages)
count = 55

# Collect per-page text and join once instead of repeated concatenation.
page_texts = []
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    page_texts.append(pageObj.extractText())
text = "".join(page_texts)
# Below if statement exists to check if the above library returned
# words. It's done because PyPDF2 cannot read scanned files.
def test_empty(outdir):
    """Opening a freshly touched, zero-byte ``empty.pdf`` must raise PdfError."""
    empty_pdf = outdir / 'empty.pdf'
    empty_pdf.touch()
    with pytest.raises(PdfError):
        Pdf.open(empty_pdf)
def graph(resources):
    """Open ``graph.pdf`` from the resources directory and return it."""
    # Original author's note on this file:
    # Has XMP and docinfo, all standard format XMP
    pdf_path = resources / 'graph.pdf'
    return Pdf.open(pdf_path)
def test_open_pdf_wrong_password(self, resources):
    """Opening the encrypted graph PDF with a wrong password must raise PasswordError."""
    # The correct passwords are "owner" and "user"
    encrypted_path = resources / 'graph-encrypted.pdf'
    with pytest.raises(PasswordError):
        Pdf.open(encrypted_path, password='******')
def trivial(resources):
    """Open ``pal-1bit-trivial.pdf`` from the resources directory and return it."""
    # Original author's note on this file:
    # Has no XMP or docinfo
    pdf_path = resources / 'pal-1bit-trivial.pdf'
    return Pdf.open(pdf_path)
def test_open_pdf_no_password_but_needed(self, resources):
    """Opening the encrypted graph PDF without any password must raise PasswordError."""
    encrypted_path = resources / 'graph-encrypted.pdf'
    with pytest.raises(PasswordError):
        Pdf.open(encrypted_path)
def invalid_creationdate(resources):
    """Open ``invalid_creationdate.pdf`` from the resources directory and return it."""
    # Original author's note on this file:
    # Has nuls in docinfo, old PDF
    pdf_path = resources / 'invalid_creationdate.pdf'
    return Pdf.open(pdf_path)
def test_no_text_stream(self, resources):
    """Passing a text-mode file stream to ``Pdf.open`` must raise TypeError."""
    pdf_path = resources / 'pal-1bit-trivial.pdf'
    # pytest.raises is entered first, then the file is opened in text mode,
    # matching the original nesting order.
    with pytest.raises(TypeError), pdf_path.open('r') as text_stream:
        Pdf.open(text_stream)
def test_memory(self, resources):
    """Passing the raw bytes of a PDF file to ``Pdf.open`` must raise an exception."""
    raw_bytes = (resources / 'pal-1bit-trivial.pdf').read_bytes()
    with pytest.raises(Exception):
        # The result is bound only to mirror the original call; the call is
        # expected to raise before any assignment takes place.
        opened = Pdf.open(raw_bytes)
def test_attr_access(resources):
    """Check that attribute-style access reaches the page count of ``graph.pdf``.

    The PDF is opened as a context manager so it is closed after the check.
    """
    with Pdf.open(resources / 'graph.pdf') as pdf:
        assert int(pdf.root.Pages.Count) == 1
def trivial(resources):
    """Open ``pal-1bit-trivial.pdf`` from the resources directory and return it."""
    pdf_path = resources / 'pal-1bit-trivial.pdf'
    return Pdf.open(pdf_path)
def test_overwrite_input(resources, outdir):
    """Saving a PDF over the file it was opened from must raise ValueError.

    The error message is expected to match ``overwrite input file``.
    """
    working_copy = outdir / 'sandwich.pdf'
    copy(resources / 'sandwich.pdf', working_copy)
    with Pdf.open(working_copy) as document, pytest.raises(
        ValueError, match=r'overwrite input file'
    ):
        document.save(outdir / 'sandwich.pdf')
def test_non_filename():
    """Passing an integer instead of a filename or stream must raise TypeError."""
    not_a_filename = 42
    with pytest.raises(TypeError):
        Pdf.open(not_a_filename)
def vera(resources):
    """Open the veraPDF test-suite sample from the resources directory and return it."""
    # Original author's note on this file:
    # Has XMP but no docinfo
    pdf_path = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
    return Pdf.open(pdf_path)
def test_not_existing_file():
    """Opening the path 'does_not_exist.pdf' must raise FileNotFoundError."""
    missing_path = 'does_not_exist.pdf'
    with pytest.raises(FileNotFoundError):
        Pdf.open(missing_path)
def test_open_pdf(resources):
    """Open ``graph.pdf`` and run its first page through ``PrintParser``.

    The test passes if parsing completes without raising. The PDF is opened
    as a context manager so it is closed afterwards.
    """
    with Pdf.open(resources / 'graph.pdf') as pdf:
        page = pdf.pages[0]
        Object._parse_stream(page, PrintParser())
def enron1(resources):
    """Open ``enron1_gs.pdf`` from the resources directory and return it."""
    # Original author's note on this file:
    # Has nuls in docinfo, old PDF
    pdf_path = resources / 'enron1_gs.pdf'
    return Pdf.open(pdf_path)