def test_read_empty():
    """An empty byte stream must be rejected with a descriptive PdfReadError."""
    empty_stream = io.BytesIO()

    with pytest.raises(PdfReadError) as exc:
        PdfReader(empty_stream)

    message = exc.value.args[0]
    assert message == "Cannot read an empty file"
def test_get_page_number(src, page_nb):
    """get_page_number() must return the index the page was fetched from."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfReader(pdf_path)

    selected_page = reader.pages[page_nb]
    reported_index = reader.get_page_number(selected_page)

    assert reported_index == page_nb
def test_get_page_mode(src, expected):
    """The reader's page_mode must equal the expected value for the resource."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfReader(pdf_path)

    actual_mode = reader.page_mode

    assert actual_mode == expected
def test_get_outlines(src, outline_elements):
    """The number of outline entries must match the expected count."""
    # `src` is handed to PdfReader as-is (no RESOURCE_ROOT join in this test).
    reader = PdfReader(src)

    outline_count = len(reader._get_outlines())

    assert outline_count == outline_elements
def test_get_num_pages(src, num_pages):
    """len(reader.pages) must equal the expected page count of the resource."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfReader(pdf_path)

    page_count = len(reader.pages)

    assert page_count == num_pages
def test_extract_text(url, name):
    """Download the PDF and make sure its metadata can be read without raising."""
    pdf_bytes = get_pdf_from_url(url, name=name)
    stream = BytesIO(pdf_bytes)
    reader = PdfReader(stream)

    # Smoke check only: the attribute access itself is the test.
    # NOTE(review): despite the test name, no page text is extracted here —
    # confirm whether this should call page.extract_text() instead.
    reader.metadata
def test_rotate(degree):
    """Rotating the first page of crazyones.pdf by `degree` must not raise."""
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")

    with open(pdf_path, "rb") as inputfile:
        reader = PdfReader(inputfile)
        first_page = reader.pages[0]
        first_page.rotate(degree)
def test_get_metadata(url, name):
    """Download the PDF and make sure its metadata can be read without raising."""
    pdf_bytes = get_pdf_from_url(url, name=name)
    stream = BytesIO(pdf_bytes)
    reader = PdfReader(stream)

    # Smoke check only: the attribute access itself is the test.
    reader.metadata
def render_to_pdf(html, filename, xml_filename, env=None):
    """Render an HTML string to PDF and return it as a download response.

    The HTML is written to a temporary file and converted by the external
    ``webkit2pdf`` executable located in the sibling ``webkit`` directory.
    When ``xml_filename`` is given, the file it points to is embedded in the
    resulting PDF as an attachment named ``certificado.xml``.

    :param html: HTML document as ``str``; it is encoded as UTF-8 on disk.
    :param filename: file name advertised in the ``Content-Disposition`` header.
    :param xml_filename: path of an XML file to embed, or a falsy value to skip.
    :param env: optional mapping of extra environment variables for the
        converter process. It is copied, never mutated. Variables from
        ``os.environ`` take precedence over these entries.
    :return: ``HttpResponse`` with content type ``application/pdf``.

    Errors raised while running the converter are printed and otherwise
    ignored (best effort); the response is then built from whatever the
    converter left in the output file.
    """
    # Work on a private copy so neither the caller's dict nor a shared
    # default object accumulates state between calls.
    env = dict(env) if env else {}

    fd_html, filename_html = tempfile.mkstemp()
    fd_pdf, filename_pdf = tempfile.mkstemp(suffix=".pdf")
    fd_pdf2, filename_pdf2 = tempfile.mkstemp(suffix=".pdf")
    # Only the paths are needed for the PDFs; release the descriptors now.
    os.close(fd_pdf)
    os.close(fd_pdf2)

    try:
        try:
            # Opening by descriptor hands ownership of fd_html to the file object.
            with open(fd_html, 'wb') as f:
                f.write(html.encode('utf8'))

            path = os.path.join(os.path.dirname(__file__), '..', 'webkit',
                                'webkit2pdf')
            env['DISPLAY'] = ':1'
            # Fake empty SSL config file to be able to run phantomjs in Buster
            env['OPENSSL_CONF'] = os.path.abspath(
                os.path.join(os.path.dirname(__file__), '..', 'webkit',
                             'openssl.cnf'))
            # Keep OS env vars, such as PATH. Applied last, so values already
            # present in the process environment win over the ones set above.
            env.update(os.environ)

            cmd = [
                path,
                "-f", filename_html,
                "-o", filename_pdf,
                "--mediaroot", settings.MEDIA_ROOT,
                "--staticroot", settings.STATIC_ROOT,
                "--scriptname", settings.FORCE_SCRIPT_NAME or '',
            ]
            proc = subprocess.Popen(cmd, env=env)
            # Block until the converter exits instead of spinning on poll().
            proc.wait()
        except Exception as error:
            # Deliberate best effort: report the problem and carry on.
            print(error)
        finally:
            os.remove(filename_html)

        if xml_filename:
            # Copy the rendered pages into a new document and attach the XML.
            with open(filename_pdf, 'rb') as pdf:
                reader = PdfReader(pdf, strict=False)
                writer = PdfWriter()
                writer.appendPagesFromReader(reader)
                with open(xml_filename, "rb") as xml:
                    writer.addAttachment("certificado.xml", xml.read())
                with open(filename_pdf2, "wb") as out:
                    writer.write(out)
            result_path = filename_pdf2
        else:
            result_path = filename_pdf

        with open(result_path, 'rb') as pdf:
            response = HttpResponse(pdf.read(),
                                    content_type='application/pdf')
        response['Content-Disposition'] = 'attachment;filename=%s' % filename
        return response
    finally:
        # Always drop both temporary PDFs, even if post-processing failed.
        for leftover in (filename_pdf, filename_pdf2):
            try:
                os.remove(leftover)
            except OSError:
                # Cleanup must not mask the original error or the response.
                pass
def text_extraction(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    :param pdf_path: value handed straight to ``PdfReader`` to open the document.
    :return: the per-page ``extract_text()`` results joined without a separator;
        an empty string when the document has no pages.
    """
    reader = PdfReader(pdf_path)
    # join() builds the result in one pass instead of re-copying the
    # accumulated string for every page.
    return "".join(page.extract_text() for page in reader.pages)
from PyPDF2 import PdfFileReader as PdfReader, PdfFileWriter as PdfWriter

# Print the page count and the text of the first page of the sample PDF.
# The reader pulls data from the file object on demand, so all access to it
# stays inside the `with` block, which also guarantees the file is closed.
with open('Ch13/meetingminutes.pdf', 'rb') as pdf_obj:
    pdf_reader = PdfReader(pdf_obj)
    print(pdf_reader.numPages)
    # Output: 19
    page_obj = pdf_reader.getPage(0)
    print(page_obj.extractText())
def convert_pdf_to_jpeg(path: str, max_pages: str, password: str, horizontal: bool = False):
    """
    Converts a PDF file into one or more combined JPEG images.

    The first ``max_pages`` pages are rendered to individual JPEG files in a
    temporary directory, resized to a common size and stacked together in
    groups of at most ``PAGES_LIMITATION`` pages per output image.

    :param path: file's path
    :param max_pages: max pages to render, or "*" to render every page
    :param password: PDF password
    :param horizontal: if True, will combine the pages horizontally
        (otherwise they are stacked vertically)
    :return: A list with the JPEG bytes of each combined image; empty when
        no page image was produced
    """
    demisto.debug(f'Loading file at Path: {path}')
    # The reader is only needed for the page count; close the file right after.
    with open(path, "rb") as pdf_file:
        total_pages = len(PdfReader(pdf_file, strict=False).pages)
    pages = total_pages if max_pages == "*" else min(int(max_pages), total_pages)

    with tempfile.TemporaryDirectory() as output_folder:
        demisto.debug('Converting PDF')
        convert_from_path(pdf_path=path,
                          fmt='jpeg',
                          first_page=1,
                          last_page=pages,
                          output_folder=output_folder,
                          userpw=password,
                          output_file='converted_pdf_')
        demisto.debug('Converting PDF - COMPLETED')

        demisto.debug('Combining all pages')
        images = []
        try:
            for page in sorted(os.listdir(output_folder)):
                page_path = os.path.join(output_folder, page)
                if os.path.isfile(page_path) and 'converted_pdf_' in page:
                    images.append(Image.open(page_path))

            if not images:
                # Nothing was rendered, so there is nothing to combine.
                return []

            # Common target size: the page size with the smallest
            # width + height (ties broken by comparing the size tuples).
            min_shape = min((image.size for image in images),
                            key=lambda size: (sum(size), size))

            # Split the pages into chunks of PAGES_LIMITATION images and
            # combine each chunk into one picture, due to the limitation of
            # images in jpeg format (max size ~65,000 pixels).
            images_matrix = [
                images[i:i + PAGES_LIMITATION]
                for i in range(0, len(images), PAGES_LIMITATION)
            ]

            stack = np.hstack if horizontal else np.vstack
            outputs = []
            for images_list in images_matrix:
                imgs_comb = stack([
                    np.asarray(image.resize(min_shape))
                    for image in images_list
                ])
                imgs_comb = Image.fromarray(imgs_comb)
                output = BytesIO()
                imgs_comb.save(output, 'JPEG')  # type: ignore
                demisto.debug('Combining all pages - COMPLETED')
                outputs.append(output.getvalue())
            return outputs
        finally:
            # Release the file handles before the temporary directory is
            # deleted (open handles block the deletion on some platforms).
            for image in images:
                image.close()