def printBT():
    """Split every PDF chosen in a file dialog into two alternating-page PDFs.

    For each selected file ``<stem>.pdf`` two files are written into the
    directory of the first selected file: ``<stem>xSplit1.pdf`` holds the pages
    at even zero-based indexes (1st, 3rd, ...), ``<stem>xSplit2.pdf`` holds the
    pages at odd indexes. The button label shows "Working!" while running and
    "Done!" afterwards.

    As before, the process working directory is changed to the directory of
    the first selected file.
    """
    lblprintBTN.config(text="Working!")
    files = filedialog.askopenfiles()

    # askopenfiles() returns *open* file objects, but only their names are
    # needed; close them straight away so no handles are leaked.
    paths = []
    for handle in files or ():
        paths.append(os.path.abspath(str(handle.name)))
        handle.close()

    if not paths:
        # Dialog was cancelled: nothing to split (previously an IndexError).
        lblprintBTN.config(text="Done!")
        return

    os.chdir(os.path.dirname(paths[0]))

    for path in paths:
        print(path)
        # splitext only strips the real extension, whatever its case, so the
        # output name can never collide with the input name.
        stem = os.path.splitext(os.path.basename(path))[0]
        with Pdf.open(path) as source:
            even_pages = Pdf.new()
            odd_pages = Pdf.new()
            for index, page in enumerate(source.pages):
                target = even_pages if index % 2 == 0 else odd_pages
                target.pages.append(page)
            # Save while the source is still open: the pages reference it.
            even_pages.save(stem + "xSplit1.pdf")
            odd_pages.save(stem + "xSplit2.pdf")

    lblprintBTN.config(text="Done!")
def test_page_labels():
    """Verify that ``Page.label`` follows the rules stored in /PageLabels."""
    pdf = Pdf.new()
    page_template = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    for index in range(5):
        pdf.pages.append(page_template)
        pdf.pages[index].Contents = Stream(
            pdf, b"BT (Page %s) Tj ET" % str(index).encode()
        )

    label_rules = Array(
        [
            0,  # new label rules begin at index 0
            Dictionary(S=Name.r),  # use lowercase roman numerals, until...
            2,  # new label rules begin at index 2
            Dictionary(
                S=Name.D, St=42, P='Prefix-'
            ),  # label pages as 'Prefix-42', 'Prefix-43', ...
        ]
    )
    pdf.Root.PageLabels = pdf.make_indirect(Dictionary(Nums=label_rules))

    expected_labels = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']
    for index, expected in enumerate(expected_labels):
        wrapped = Page(pdf.pages[index])
        assert wrapped.label == expected
def break_to_small_pdf_paths_original(pdf_path, output_directory=None, start_page=1, end_page=None, small_pdf_pages=25):
    """Split a PDF into consecutive smaller PDFs and return their paths.

    Args:
        pdf_path: Path of the PDF to split.
        output_directory: Where the parts are written. Defaults to a sibling
            directory named ``<stem>_small_originals``.
        start_page: First page to include (1-based).
        end_page: Last page to include (1-based, inclusive). Defaults to the
            last page of the document.
        small_pdf_pages: Chunk size handed to ``list_helper.divide_chunks``.

    Returns:
        List of destination paths, one per chunk, named
        ``<stem>_<first>-<last>.pdf``. A chunk whose file already exists is
        not rewritten (a warning is logged) but its path is still returned.
    """
    pdf_name_stem = Path(pdf_path).stem
    if output_directory is None:
        output_directory = os.path.join(
            os.path.dirname(pdf_path), pdf_name_stem + "_small_originals")

    dest_pdfs = []
    # noinspection PyArgumentList
    with Pdf.open(pdf_path) as pdf:
        if end_page is None:
            end_page = len(pdf.pages)
        page_numbers = range(start_page, end_page + 1)
        # NOTE(review): divide_chunks is assumed to yield indexable chunks
        # (page_set[0] / page_set[-1] are used below) - confirm in list_helper.
        page_sets = list_helper.divide_chunks(list_in=page_numbers, n=small_pdf_pages)
        for page_set in page_sets:
            # Page numbers are 1-based, pikepdf page indexes are 0-based.
            chunk_pages = [pdf.pages[i - 1] for i in page_set]
            dest_pdf_path = os.path.join(
                output_directory,
                "%s_%04d-%04d.pdf" % (pdf_name_stem, page_set[0], page_set[-1]),
            )
            if not os.path.exists(dest_pdf_path):
                # noinspection PyArgumentList
                dest_pdf = Pdf.new()
                dest_pdf.pages.extend(chunk_pages)
                os.makedirs(os.path.dirname(dest_pdf_path), exist_ok=True)
                # Pass the target positionally: the keyword name of this
                # parameter differs between pikepdf releases.
                dest_pdf.save(dest_pdf_path)
            else:
                logging.warning("%s exists", dest_pdf_path)
            dest_pdfs.append(dest_pdf_path)
    return dest_pdfs
def split_into_small_pdfs(pdf_path, output_directory=None, start_page=1, end_page=None, small_pdf_pages=25):
    """Split a PDF into consecutive smaller PDFs inside ``output_directory``.

    Args:
        pdf_path: Path of the PDF to split.
        output_directory: Where the parts are written. Defaults to
            ``_get_ocr_dir(pdf_path)``.
        start_page: First page to include (1-based).
        end_page: Last page to include (1-based, inclusive). Defaults to the
            last page of the document.
        small_pdf_pages: Chunk size handed to ``list_helper.divide_chunks``.

    Each chunk is saved as ``<stem>_<first>-<last>.pdf``. Existing files are
    left untouched and a warning is logged instead.
    """
    pdf_name_stem = Path(pdf_path).stem
    if output_directory is None:
        output_directory = _get_ocr_dir(pdf_path)

    # noinspection PyArgumentList
    with Pdf.open(pdf_path) as pdf:
        if end_page is None:
            end_page = len(pdf.pages)
        page_numbers = range(start_page, end_page + 1)
        # NOTE(review): divide_chunks is assumed to yield indexable chunks
        # (page_set[0] / page_set[-1] are used below) - confirm in list_helper.
        page_sets = list_helper.divide_chunks(list_in=page_numbers, n=small_pdf_pages)
        for page_set in page_sets:
            # Page numbers are 1-based, pikepdf page indexes are 0-based.
            chunk_pages = [pdf.pages[i - 1] for i in page_set]
            dest_pdf_path = os.path.join(
                output_directory,
                "%s_%04d-%04d.pdf" % (pdf_name_stem, page_set[0], page_set[-1]))
            if not os.path.exists(dest_pdf_path):
                # noinspection PyArgumentList
                dest_pdf = Pdf.new()
                dest_pdf.pages.extend(chunk_pages)
                os.makedirs(os.path.dirname(dest_pdf_path), exist_ok=True)
                # Positional on purpose: current pikepdf names this parameter
                # ``filename_or_stream``, so ``filename=`` raises TypeError.
                dest_pdf.save(dest_pdf_path)
            else:
                logging.warning("%s exists", dest_pdf_path)
def unlockPdf(filepath):
    """Write an unencrypted copy of a password-protected PDF.

    The password is taken from the file name with its last four characters
    (the ``.pdf`` extension) removed. The copy is saved as ``decrypted.pdf``
    in the same directory as ``filepath``.

    Args:
        filepath: Path of the encrypted PDF.
    """
    password = os.path.basename(filepath)[0:-4]
    # os.path.join copes with an empty dirname (bare file name); plain string
    # concatenation produced "/decrypted.pdf", i.e. the filesystem root.
    output_path = os.path.join(os.path.dirname(filepath), 'decrypted.pdf')

    # The source must stay open until the copy has been saved, and is closed
    # afterwards instead of being leaked.
    with Pdf.open(filepath, password=password) as pdffile:
        newPdf = Pdf.new()
        newPdf.pages.extend(pdffile.pages)
        newPdf.save(output_path)
def split_pdf(stream, opcode, data):
    """Extract a single page or a page range from an open PDF, then close it.

    Args:
        stream: An open Pdf object; it is always closed before returning.
        opcode: ``0`` extracts the single page ``data["number"]`` into
            ``<number>.pdf``; ``1`` extracts pages ``data["start"]`` through
            ``data["end"]`` (zero-based, inclusive) into ``output.pdf``.
            Any other value only closes ``stream``.
        data: Dict carrying the keys required by the chosen ``opcode``.
    """
    pdf = stream
    try:
        if opcode == 0:
            new_pdf = Pdf.new()
            new_pdf.pages.append(pdf.pages[data["number"]])
            new_pdf.save(str(data["number"]) + '.pdf')
        elif opcode == 1:
            new_pdf = Pdf.new()
            start = data["start"]
            end = data["end"]
            for n, page in enumerate(pdf.pages):
                if n >= start:
                    new_pdf.pages.append(page)
                if n == end:
                    break
            # The former ``new_pdf.author()`` call was removed: Pdf objects
            # have no such method, so it raised AttributeError before the
            # result could be saved.
            new_pdf.save('output.pdf')
    finally:
        # Release the input even when extraction or saving fails.
        pdf.close()
def pdfs():
    """Yield paths of two-page parts cut from ``content-stream-errors.pdf``.

    Pages are collected into an output Pdf; each time it holds two pages it
    is saved as ``part-<n>.pdf`` under ``tmp_path``, its path is yielded and
    a fresh output is started. Any remaining pages go into a final part.
    """

    def _part_path(number):
        return tmp_path / f"part-{number}.pdf"

    source_path = resources / "content-stream-errors.pdf"
    with Pdf.open(source_path) as pdf, Pdf.new() as output:
        part = 1
        for page in pdf.pages:
            if len(output.pages) == 2:
                # Current part is full: flush it and start the next one.
                part_file = _part_path(part)
                output.save(part_file)
                yield part_file
                output = Pdf.new()
                part += 1
            output.pages.append(page)

        if len(output.pages) > 0:
            part_file = _part_path(part)
            output.save(part_file)
            yield part_file
def concatenate(n):
    """Build ``<n>.pdf`` in ``outdir`` from ``n`` copies of ``pal.pdf``'s pages.

    Every iteration opens its own Pdf for ``pal.pdf``; none of them is
    closed here.
    """
    output_pdf = Pdf.new()
    for i in range(n):
        print(i)
        source = Pdf.open(resources / 'pal.pdf')
        output_pdf.pages.extend(source.pages)
    destination = outdir / f'{n}.pdf'
    output_pdf.save(destination)
def to_image_unlock(filepath, pw):
    """Replace an encrypted PDF with an unencrypted copy, then convert it.

    Args:
        filepath: Path of the password-protected PDF; overwritten in place.
        pw: Password used to open the PDF.

    Returns:
        Whatever ``to_image(filepath)`` returns for the rewritten file.
    """
    import io

    # Page content is read lazily from the source while saving, so writing
    # straight onto ``filepath`` would overwrite a file that is still being
    # read. Serialise into memory first and only touch the file on disk once
    # the source has been closed.
    buffer = io.BytesIO()
    with Pdf.open(filepath, password=pw) as data:
        newPdf = Pdf.new()
        newPdf.pages.extend(data.pages)
        newPdf.save(buffer)

    with open(filepath, "wb") as target:
        target.write(buffer.getvalue())

    return to_image(filepath)
def test_split_pdf(fourpages, outdir):
    """Save each page of the fixture as its own PDF and count the files."""
    for page_number, page in enumerate(fourpages.pages, start=1):
        single_page = Pdf.new()
        single_page.pages.append(page)
        single_page.save(outdir / f"page{page_number}.pdf")

    written = [entry for entry in outdir.iterdir() if entry.name.startswith('page')]
    assert len(written) == 4
def test_push_stack(fourpages, outpdf):
    """Check how ``push_stack`` affects the matrix applied to an overlay.

    The page's own content stream ends with a mirroring ``cm`` operator. One
    overlay is added with ``push_stack=False`` and one with
    ``push_stack=True``; the resulting draw events are then inspected.
    """
    pdf = Pdf.new()
    pdf.add_blank_page(page_size=(1000, 1000))
    page = pdf.pages[0]
    pdf.pages.extend(fourpages.pages)

    base_content = (
        b"0.4 G\n"
        b"0 500 500 1000 re s\n"
        b"500 500 1000 1000 re s\n"
        b"-1 0 0 1 500 0 cm\n"
    )
    page.Contents = pdf.make_stream(base_content)

    upper_left = Rectangle(0, 500, 500, 1000)
    upper_right = Rectangle(500, 500, 1000, 1000)
    xobj1 = page.add_overlay(pdf.pages[1], upper_left, push_stack=False)
    xobj2 = page.add_overlay(pdf.pages[2], upper_right, push_stack=True)

    draw_events = _simple_interpret_content_stream(page)

    # First page should be mirrored horizontally since stack was not pushed
    first_xobj, first_ctm = next(draw_events)
    assert first_xobj == xobj1
    assert first_ctm.a < 0 and first_ctm.d > 0, "Not horizontally mirrored as expected"

    # Second page should be in upper right corner, properly positioned for a 4-up
    second_xobj, second_ctm = next(draw_events)
    assert second_xobj == xobj2
    assert second_ctm.e >= 500 and second_ctm.f >= 500

    # Test requires visual confirmation
    del pdf.pages[1:]
    pdf.save(outpdf)
def test_add_foreign_twice(graph, outpdf):
    """Copy the same foreign page into a new Pdf twice and save the result."""
    out = Pdf.new()
    for expected_count in (1, 2):
        foreign_copy = out.copy_foreign(graph.pages[0])
        out.pages.append(foreign_copy)
        assert len(out.pages) == expected_count
    out.save(outpdf)
def img2pdfBT():
    """Combine the images of each directory under a chosen folder into a PDF.

    The user picks a directory and enters a prefix. For every directory in
    the tree that contains ``.jpg``/``.png`` files, the images are converted
    with img2pdf (in file-name order) and merged into
    ``<directory path>_<prefix>.pdf``, i.e. a file next to that directory.
    Directories without matching images are skipped. The button label shows
    "Working!" while running and "Done!" afterwards.
    """
    lblimgtopdfBTN.config(text="Working!")
    path = filedialog.askdirectory()
    mname = simpledialog.askstring(title="Name", prompt="Enter prefix name.")
    if not path or mname is None:
        # A dialog was cancelled; without this guard the output files would
        # be named with the literal prefix "None".
        lblimgtopdfBTN.config(text="Done!")
        return

    for root, dirs, files in os.walk(path):
        # Sorted so the page order does not depend on directory listing order.
        image_names = sorted(x for x in files if x.endswith((".jpg", ".png")))
        if not image_names:
            # Nothing to convert here; do not write an empty PDF.
            continue

        pdf = Pdf.new()
        sources = []
        try:
            for x in image_names:
                print(root + "/" + x)
                image_path = os.path.normpath(os.path.join(root, x))
                pdf_bytes = img2pdf.convert(image_path)
                src = Pdf.open(io.BytesIO(pdf_bytes))
                sources.append(src)
                pdf.pages.extend(src.pages)
            nameoffile = os.path.normpath(root) + "_" + str(mname) + ".pdf"
            pdf.save(nameoffile)
        finally:
            # The one-page source PDFs must stay open until the merged file
            # has been written; only then can they be released.
            for src in sources:
                src.close()

    lblimgtopdfBTN.config(text="Done!")
def pdfs():
    """Yield paths of two-page parts cut from ``content-stream-errors.pdf``.

    Pages are collected into an output Pdf; each time it holds two pages it
    is saved as ``part-<n>.pdf`` under ``tmp_path``, its path is yielded and
    a fresh output is started. Any remaining pages go into a final part.

    The source and every intermediate output are closed, including when the
    consumer stops iterating early.
    """
    with Pdf.open(resources / "content-stream-errors.pdf") as pdf:
        output = Pdf.new()
        try:
            part = 1
            for page in pdf.pages:
                if len(output.pages) == 2:
                    part_file = tmp_path / "part-{0}.pdf".format(part)
                    output.save(part_file)
                    yield part_file
                    # This part is on disk; release it before starting the
                    # next one.
                    output.close()
                    output = Pdf.new()
                    part += 1
                output.pages.append(page)
            if len(output.pages) > 0:
                part_file = tmp_path / "part-{0}.pdf".format(part)
                output.save(part_file)
                yield part_file
        finally:
            output.close()
def concatenate(n):
    """Build ``<n>.pdf`` in ``outdir`` from ``n`` copies of ``pal.pdf``'s pages.

    Progress is printed for every copy. Every iteration opens its own Pdf
    for ``pal.pdf``; none of them is closed here.
    """
    print('concatenating same page', n, 'times')
    output_pdf = Pdf.new()
    for i in range(n):
        print(i)
        source = Pdf.open(resources / 'pal.pdf')
        output_pdf.pages.extend(source.pages)
    destination = outdir / '{}.pdf'.format(n)
    output_pdf.save(destination)
def mege_pdf(files_m):
    """Merge several PDFs into one file named after the first input.

    Args:
        files_m: Sequence of PDF paths, merged in the given order. The
            output path is the first entry with its extension replaced by
            ``Combined.pdf`` (``a.pdf`` -> ``aCombined.pdf``).

    Raises:
        IndexError: If ``files_m`` is empty.
    """
    import os

    pdf_out = Pdf.new()
    # Replace only the real extension. ``str.replace(".pdf", ...)`` also
    # rewrote ".pdf" inside directory names, and left names such as "a.PDF"
    # unchanged so that the output path equalled the first input.
    out_p_name = os.path.splitext(files_m[0])[0] + "Combined.pdf"

    sources = []
    try:
        for name in files_m:
            src = Pdf.open(name)
            sources.append(src)
            pdf_out.pages.extend(src.pages)
        pdf_out.save(out_p_name)
    finally:
        # Sources must stay open until the merged file is written; close
        # them afterwards instead of leaking them.
        for src in sources:
            src.close()
    print("Combined pdfs saved to : " + out_p_name)
def extract_pages(from_s, to_s):
    """Print each index in ``range(from_s, to_s)`` and save ``Extracted_PAGES.pdf``.

    The statement that would append pages is disabled, so the saved PDF
    contains no pages.
    """
    out_pdf = Pdf.new()
    for page_index in range(from_s, to_s):
        print(page_index)
        # Disabled in the original: out_pdf.pages.append(sample_pdf.pages[s])
    target_name = "Extracted_PAGES.pdf"
    out_pdf.save(target_name)
def get_path_value(self):
    """Convert the files in each sub-folder to PDF and merge them per folder.

    Reads the base path from ``self.path_value``. For every entry in that
    path, images (.jpg/.jpeg/.png) are saved as PDFs next to the original,
    .docx files are handed to ``convert``, and all PDFs found in the entry
    afterwards are merged into
    ``<base path>\\<entry stem> - <base path stem>.pdf``.

    Side effects: sets ``self.path``, ``self.folder_name`` and
    ``self.file_name``; writes PDF files; prints progress.
    """
    self.path = self.path_value.get()
    # Last backslash-separated component; raises IndexError when the path
    # contains no backslash.
    self.folder_name = self.path.rsplit("\\",1)[1]
    # NOTE(review): every entry is scanned as a directory below, so a plain
    # file directly inside self.path would make os.scandir() fail - confirm
    # the expected folder layout.
    for folder_entry in os.scandir(self.path):
        self.folder_name = Path(folder_entry).stem
        # One merger per folder entry. PdfFileMerger / PdfFileReader are
        # presumably PyPDF2's - verify against the file's imports.
        mergedObject = PdfFileMerger()
        # First pass: convert everything that is not yet a PDF.
        for file_entry in os.scandir(folder_entry):
            self.file_name = Path(file_entry).stem
            print("Filenname", self.file_name)
            # Report files whose extension is not handled at all.
            if not (file_entry.path.endswith(".jpg") or file_entry.path.endswith(".jpeg") or file_entry.path.endswith(".png")
                    # .doc handling is disabled
                    or file_entry.path.endswith(".docx") or file_entry.path.endswith(".pdf") )and file_entry.is_file():
                print("Was ist das? Konvertier ich nicht " + file_entry.path)
            # Images: convert to RGB and save under the same name with .pdf.
            if (file_entry.path.endswith(".jpg") or file_entry.path.endswith(".jpeg") or file_entry.path.endswith(".png")) and file_entry.is_file():
                image = Image.open(file_entry.path)
                i = image.convert('RGB')
                i.save(os.path.splitext(file_entry.path)[0] + ".pdf")
            # .doc conversion is disabled; only .docx is converted.
            # `convert` is defined elsewhere - presumably docx2pdf; TODO confirm.
            if (file_entry.path.endswith(".docx")) and file_entry.is_file():
                convert(file_entry)
        # Second pass: re-scan so that the PDFs created above are included.
        for file_entry in os.scandir(folder_entry):
            if (file_entry.path.endswith(".pdf") and file_entry.is_file()):
                # NOTE(review): new_pdf is never used.
                new_pdf = Pdf.new()
                # Each PDF is opened and saved back over itself before being
                # merged - presumably to normalise files the merger cannot
                # read as-is; the reason is not visible here.
                with Pdf.open(file_entry.path, allow_overwriting_input=True) as pdf:
                    pdf.save(file_entry.path)
                # NOTE(review): "rb" is passed as the second positional
                # argument; check which parameter of PdfFileReader that is.
                mergedObject.append(PdfFileReader(file_entry.path, "rb"))
        # The merged PDF is written into the base path, not the sub-folder.
        mergedObject.write(self.path + "\\" + Path(folder_entry).stem + " - " +Path(self.path).stem + ".pdf")
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
    """Write a one-image PDF, reopen it and check the image through PdfImage.

    Args:
        outdir: Directory receiving the generated PDF.
        w, h: Image width and height in pixels.
        pixeldata: Bytes of one pixel; repeated ``w * h`` times.
        cs: Colour space name, e.g. ``'/DeviceRGB'``.
        bpc: Bits per component.
    """
    pdf = Pdf.new()

    image = Stream(pdf, pixeldata * (w * h))
    image.Type = Name('/XObject')
    image.Subtype = Name('/Image')
    image.ColorSpace = Name(cs)
    image.BitsPerComponent = bpc
    image.Width = w
    image.Height = h

    contents = Stream(pdf, b'q 100 0 0 100 0 0 cm /Im1 Do Q')
    page = pdf.make_indirect(
        {
            '/Type': Name('/Page'),
            '/MediaBox': [0, 0, 100, 100],
            '/Contents': contents,
            '/Resources': {'/XObject': {'/Im1': image}},
        }
    )
    pdf.pages.append(page)

    outfile = outdir / f'test{w}{h}{cs[1:]}{bpc}.pdf'
    pdf.save(
        outfile, compress_streams=False, stream_decode_level=StreamDecodeLevel.none
    )

    with Pdf.open(outfile) as reopened:
        pim = PdfImage(reopened.pages[0].Resources.XObject['/Im1'])
        assert pim.bits_per_component == bpc
        assert pim.colorspace == cs
        assert pim.width == w
        assert pim.height == h

        # Same precedence as an if/elif chain; no match means no mode check.
        expected_mode = None
        if cs == '/DeviceRGB':
            expected_mode = 'RGB'
        elif cs == '/DeviceGray' and bpc == 8:
            expected_mode = 'L'
        elif cs == '/DeviceCMYK':
            expected_mode = 'CMYK'
        elif bpc == 1:
            expected_mode = '1'
        if expected_mode is not None:
            assert pim.mode == expected_mode

        assert not pim.palette
        assert pim.filters == []
        assert pim.read_bytes() == pixeldata

        extracted = BytesIO()
        pim.extract_to(stream=extracted)
        extracted.seek(0)
        im = Image.open(extracted)
        assert pim.mode == im.mode
def test_split_pdf(resources, outdir):
    """Save each page of ``fourpages.pdf`` as its own PDF and count the files."""
    # Open the source in a ``with`` block so it is closed once every
    # single-page file has been written, instead of being left open.
    with Pdf.open(resources / "fourpages.pdf") as q:
        for n, page in enumerate(q.pages):
            outpdf = Pdf.new()
            outpdf.pages.append(page)
            outpdf.save(outdir / "page{}.pdf".format(n + 1))

    assert len([f for f in outdir.iterdir() if f.name.startswith('page')]) == 4
def export_page(self, page_idx):
    """Write the page at ``page_idx`` of ``self.reader`` to ``./page.pdf``.

    Returns:
        The path of the written file, always ``"./page.pdf"``.
    """
    selected_page = self.reader.pages[page_idx]

    single_page_pdf = Pdf.new()
    single_page_pdf.pages.append(selected_page)

    tmpfname = "./page.pdf"
    single_page_pdf.save(tmpfname)
    single_page_pdf.close()
    return tmpfname
def test_create_pdf(outdir):
    """Build a one-page PDF with a font, an image and text, and save it."""
    pdf = Pdf.new()

    font = pdf.make_indirect(
        Object.parse(b""" << /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>"""))

    width, height = 100, 100
    image = Stream(pdf, b"\xff\x7f\x00" * (width * height))
    image.stream_dict = Object.parse(b""" << /Type /XObject /Subtype /Image /ColorSpace /DeviceRGB /BitsPerComponent 8 /Width 100 /Height 100 >>""")

    resources = {
        '/Font': {'/F1': font},
        '/XObject': {'/Im1': image},
    }

    # Draw "Hi there" with font F1, then place image Im1.
    content_bytes = b""" BT /F1 24 Tf 72 720 Td (Hi there) Tj ET q 144 0 0 144 234 324 cm /Im1 Do Q """
    contents = Stream(pdf, content_bytes)

    page = pdf.make_indirect(
        {
            '/Type': Name('/Page'),
            '/MediaBox': [0, 0, 612, 792],
            '/Contents': contents,
            '/Resources': resources,
        }
    )
    pdf.pages.append(page)
    pdf.save(outdir / 'hi.pdf')
def test_issue_271():
    """Appending one page twice to another Pdf must give independent copies."""
    source = Pdf.new()
    target = Pdf.new()
    p1 = source.add_blank_page()

    # copy p1 to the target and change the copy's mediabox
    target.pages.append(p1)
    p2 = target.pages[0]
    p2.MediaBox[0] = 1
    p2.Rotate = 1

    # Append the same source page again, after the first copy was modified.
    target.pages.append(p1)
    p3 = target.pages[1]

    assert p2.MediaBox[0] != p1.MediaBox[0]
    assert Name.Rotate in p2
    assert Name.Rotate not in p1

    assert p3.MediaBox[0] == p1.MediaBox[0]
    assert Name.Rotate not in p3
def test_save_bytesio(resources, outpdf):
    """Saving to a BytesIO must give the same bytes as saving to a file."""
    source_path = resources / 'fourpages.pdf'
    with Pdf.open(source_path) as input_:
        pdf = Pdf.new()
        for page in input_.pages:
            pdf.pages.append(page)

        buffer = BytesIO()
        pdf.save(buffer)
        in_memory_bytes = buffer.getvalue()
        assert in_memory_bytes != b''

        pdf.save(outpdf)
        on_disk_bytes = outpdf.read_bytes()
        assert on_disk_bytes == in_memory_bytes
def pdfConverter():
    """Convert a chain of javatpoint pages to PDF and merge them.

    Reads ``url`` (first page) and ``end`` (full URL of the page at which to
    stop) from the submitted form. Starting at ``url`` it follows each
    page's ``.next`` link, converting every page to
    ``static/pdf/<counter>.pdf`` until the link equals ``end``. All PDFs in
    ``static/pdf/`` are then merged in numeric file-name order, followed by
    ``static/final_pdf/blank.pdf``, into ``static/final_pdf/merged.pdf``,
    and the intermediate files are removed.

    Any exception is printed and otherwise ignored; the download page is
    rendered in every case.
    """
    url = request.form['url']
    endurl = request.form['end']
    print(endurl)
    end = endurl.replace("https://www.javatpoint.com/", "")
    print(end)
    print(url)
    print("Check")
    try:
        # NOTE(review): API credentials are hard-coded in source; consider
        # loading them from configuration instead.
        client = pdfcrowd.HtmlToPdfClient('demo', 'ce544b6ea52a5621fb9d55f8b542d14d')
        client.convertUrlToFile(url, 'static/pdf/1.pdf')
        print("1")
        counter = 2
        while True:
            res = requests.get(url)
            soup = bs4.BeautifulSoup(res.text, "lxml")
            data = soup.select(".next", href=True)
            nextpoint = data[0]['href']
            print(nextpoint)
            url = "https://www.javatpoint.com/" + nextpoint
            if nextpoint == end:
                break
            try:
                client.convertUrlToFile(url, 'static/pdf/' + f'{counter}.pdf')
                print(counter)
                counter += 1
            except pdfcrowd.Error as why:
                sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))
                raise

        filepath_list = os.listdir('static/pdf/')
        print(filepath_list)

        def _page_order(file_name):
            # Numbered files sort by number ("2.pdf" before "10.pdf"); any
            # other PDF comes afterwards in name order.
            stem = file_name[:-len('.pdf')]
            if stem.isdecimal():
                return (0, int(stem), file_name)
            return (1, 0, file_name)

        # os.listdir() returns names in arbitrary order; sort explicitly so
        # the merged document follows the crawl order.
        pdf_names = sorted(
            (name for name in filepath_list if name.endswith('.pdf')),
            key=_page_order,
        )

        pdf = Pdf.new()
        sources = []
        try:
            for file in pdf_names:
                print(file)
                src = Pdf.open('static/pdf/' + file)
                sources.append(src)
                print("@#$")
                pdf.pages.extend(src.pages)
            src = Pdf.open('static/final_pdf/blank.pdf')
            sources.append(src)
            pdf.pages.extend(src.pages)
            pdf.save('static/final_pdf/merged.pdf')
        finally:
            # Sources must stay open until the merge is saved, and must be
            # closed before their files are deleted below.
            for src in sources:
                src.close()

        for file in pdf_names:
            os.remove('static/pdf/' + file)
    except Exception as e:
        # Best-effort handler: report the problem and still render the page.
        print(e)
    return render_template('download.html')
def test_foreign_copied_pages_are_true_copies(graph, outpdf):
    """Rotating some foreign copies of a page must not affect the others."""
    out = Pdf.new()
    for n in range(4):
        out.pages.append(out.copy_foreign(graph.pages[0]))

    for n in [0, 2]:
        out.pages[n].Rotate = 180

    out.save(outpdf)
    # Reopen inside a ``with`` block so the saved file is closed again
    # instead of being left open after the assertions.
    with Pdf.open(outpdf) as reopened:
        assert reopened.pages[0].Rotate == 180
        assert reopened.pages[1].get(Name.Rotate, 0) == 0
def test_repeat_using_intermediate(graph, outpdf):
    """Repeat a page three times by way of throwaway intermediate Pdfs."""

    def _repeat_page(pdf_in, page, count, pdf_out):
        # Each repetition goes through its own temporary Pdf, whose pages
        # are then added to the real output.
        for _duplicate in range(count):
            intermediate = Pdf.new()
            intermediate.pages.append(pdf_in.pages[page])
            pdf_out.pages.extend(intermediate.pages)
        return pdf_out

    with Pdf.new() as out:
        _repeat_page(graph, 0, 3, out)
        page_count = len(out.pages)
        assert page_count == 3
        out.save(outpdf)
def preprocess_pdf(fname='temp.pdf'):
    """Rewrite a PDF in place by copying its pages into a fresh document.

    The rewritten document is saved to ``<fname>.tmp`` first and then moved
    over ``fname``. No ``.tmp`` file is left behind on success.

    Args:
        fname: Path of the PDF to rewrite.
    """
    import os

    from pikepdf import Pdf

    tmp_output_file_path = fname + '.tmp'

    # Close the source before touching the file on disk.
    with Pdf.open(fname) as pdf:
        new_pdf = Pdf.new()
        for page_obj in pdf.pages:
            new_pdf.pages.append(page_obj)
        new_pdf.save(tmp_output_file_path)

    # Previously the original was renamed onto the very path the rewritten
    # file had just been saved to, and then renamed back, so the rewritten
    # file was thrown away. Replace the original with it in a single step.
    os.replace(tmp_output_file_path, fname)
    print(f"Fixed {fname}")
def juntar_merge_dois_pdfs():
    """Merge ``PDF_Exemplo2.pdf`` and ``PDF_Exemplo3.pdf`` into ``pdf_combinado.pdf``.

    Both sources are opened up front, their pages are added in order, and
    all three documents are closed after the merged file has been saved.
    """
    combined = Pdf.new()
    source_names = ('PDF_Exemplo2.pdf', 'PDF_Exemplo3.pdf')
    sources = [Pdf.open(name) for name in source_names]

    for source in sources:
        combined.pages.extend(source.pages)
    combined.save('pdf_combinado.pdf')

    for source in sources:
        source.close()
    combined.close()
def merge_files(input_path, list_files, output_file):
    """Merge several PDF files into one.

    Args:
        input_path: String prefix prepended to every entry of ``list_files``
            by plain concatenation, so a directory must include its trailing
            separator.
        list_files: File names to merge, in order.
        output_file: Path of the merged PDF.
    """
    pdf = Pdf.new()
    version = pdf.pdf_version
    sources = []
    try:
        for _file in list_files:
            src = Pdf.open(input_path + _file)
            sources.append(src)
            pdf.pages.extend(src.pages)
        pdf.save(output_file, min_version=version)
    finally:
        # Sources must stay open until the merged file is written; close
        # them afterwards instead of leaking them.
        for src in sources:
            src.close()