예제 #1
0
def test_read_empty():
    """Check that building a PdfReader from an empty stream raises PdfReadError."""
    empty_stream = io.BytesIO()
    with pytest.raises(PdfReadError) as exc:
        PdfReader(empty_stream)
    # The first exception argument carries the human-readable message.
    message = exc.value.args[0]
    assert message == "Cannot read an empty file"
예제 #2
0
def test_get_page_number(src, page_nb):
    """Check that get_page_number() maps a page object back to its index.

    ``src`` is a file name relative to RESOURCE_ROOT; ``page_nb`` is the index
    of the page that is looked up and then resolved back to a number.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfReader(pdf_path)
    resolved_index = reader.get_page_number(reader.pages[page_nb])
    assert resolved_index == page_nb
예제 #3
0
def test_get_page_mode(src, expected):
    """Check that the reader's ``page_mode`` for ``src`` equals ``expected``.

    ``src`` is a file name relative to RESOURCE_ROOT.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    actual_mode = PdfReader(pdf_path).page_mode
    assert actual_mode == expected
예제 #4
0
def test_get_outlines(src, outline_elements):
    """Check that ``_get_outlines()`` yields ``outline_elements`` entries for ``src``.

    ``src`` is handed to PdfReader as given (it is not joined with any root
    directory here).
    """
    outline = PdfReader(src)._get_outlines()
    entry_count = len(outline)
    assert entry_count == outline_elements
예제 #5
0
def test_get_num_pages(src, num_pages):
    """Check that the PDF named ``src`` under RESOURCE_ROOT has ``num_pages`` pages."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    page_count = len(PdfReader(pdf_path).pages)
    assert page_count == num_pages
예제 #6
0
def test_extract_text(url, name):
    """Fetch a PDF via ``get_pdf_from_url`` and access the reader's metadata.

    Nothing is asserted; the test passes as long as no exception is raised.

    NOTE(review): despite the test name, no text extraction is performed here,
    only ``reader.metadata`` is accessed -- confirm this is intended.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    reader.metadata
예제 #7
0
def test_rotate(degree):
    """Rotate the first page of crazyones.pdf by ``degree``.

    Nothing is asserted; the test passes as long as no exception is raised.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    with open(pdf_path, "rb") as inputfile:
        reader = PdfReader(inputfile)
        reader.pages[0].rotate(degree)
예제 #8
0
def test_get_metadata(url, name):
    """Fetch a PDF via ``get_pdf_from_url`` and access the reader's metadata.

    Nothing is asserted; the test passes as long as reading ``metadata`` does
    not raise.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    reader.metadata
예제 #9
0
파일: pdf_utils.py 프로젝트: pachi/visorxml
def render_to_pdf(html, filename, xml_filename, env=None):
    """Render an HTML string to PDF and return it as a download response.

    The HTML is written to a temporary file and converted by the external
    ``webkit2pdf`` program located in the sibling ``webkit`` directory. When
    ``xml_filename`` is truthy, that file's contents are embedded in the PDF as
    an attachment named ``certificado.xml``.

    Args:
        html: HTML document as text; it is encoded as UTF-8 before rendering.
        filename: File name advertised in the ``Content-Disposition`` header.
        xml_filename: Path of an XML file to embed, or a falsy value to skip.
        env: Optional mapping of extra environment variables for the renderer.
            It is copied, never modified.

    Returns:
        An ``HttpResponse`` with content type ``application/pdf``.

    Errors raised while writing the HTML or running the renderer are printed
    and otherwise ignored, so the response may carry an empty PDF file in that
    case (or PDF parsing may fail when ``xml_filename`` is given).
    """
    # The "return raw HTML" debugging shortcut is deliberately switched off;
    # flip this flag locally to inspect the HTML instead of the PDF.
    debug = False
    if debug:
        return HttpResponse(html)

    # Work on a private copy: the previous mutable default argument was
    # modified in place and therefore shared between calls.
    proc_env = dict(env) if env is not None else {}

    webkit_dir = os.path.join(os.path.dirname(__file__), '..', 'webkit')

    fd_html, filename_html = tempfile.mkstemp()
    fd_pdf, filename_pdf = tempfile.mkstemp(suffix=".pdf")
    fd_pdf2, filename_pdf2 = tempfile.mkstemp(suffix=".pdf")
    # Only the paths are needed for the PDF files; close both descriptors so
    # they are not leaked.
    os.close(fd_pdf)
    os.close(fd_pdf2)

    try:
        try:
            with open(fd_html, 'wb') as f:
                f.write(html.encode('utf8'))

            proc_env['DISPLAY'] = ':1'
            # Fake empty SSL config file to be able to run phantomjs in Buster
            proc_env['OPENSSL_CONF'] = os.path.abspath(
                os.path.join(webkit_dir, 'openssl.cnf'))
            # Keep OS env vars, such as PATH.
            # NOTE(review): values from os.environ override the ones set above
            # and the ones passed by the caller (original precedence kept) --
            # confirm this is the intended order.
            proc_env.update(os.environ)

            cmd = [
                os.path.join(webkit_dir, 'webkit2pdf'),
                "-f", filename_html, "-o", filename_pdf,
                "--mediaroot", settings.MEDIA_ROOT,
                "--staticroot", settings.STATIC_ROOT,
                "--scriptname", settings.FORCE_SCRIPT_NAME or ''
            ]
            proc = subprocess.Popen(cmd, env=proc_env)
            # Block until the renderer exits instead of spinning on poll().
            proc.wait()
        except Exception as error:
            print(error)
        finally:
            os.remove(filename_html)

        result_path = filename_pdf
        if xml_filename:
            with open(filename_pdf, 'rb') as pdf, \
                    open(xml_filename, "rb") as xml:
                reader = PdfReader(pdf, strict=False)
                writer = PdfWriter()
                writer.appendPagesFromReader(reader)
                writer.addAttachment("certificado.xml", xml.read())
                with open(filename_pdf2, "wb") as out:
                    writer.write(out)
            result_path = filename_pdf2

        with open(result_path, 'rb') as pdf:
            payload = pdf.read()
    finally:
        # Remove both temporary PDFs even when post-processing fails.
        os.remove(filename_pdf)
        os.remove(filename_pdf2)

    response = HttpResponse(payload, content_type='application/pdf')
    response['Content-Disposition'] = 'attachment;filename=%s' % filename
    return response
예제 #10
0
파일: bench.py 프로젝트: mstamy2/PyPDF2
def text_extraction(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated in page order."""
    reader = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in reader.pages)
from PyPDF2 import PdfFileReader as PdfReader, PdfFileWriter as PdfWriter

# Print the page count and the text of the first page of the sample PDF.
# The file is opened in a context manager so the handle is closed once the
# reader is no longer needed (it was previously left open).
with open('Ch13/meetingminutes.pdf', 'rb') as pdf_obj:
    pdf_reader = PdfReader(pdf_obj)
    print(pdf_reader.numPages) # Output: 19

    page_obj = pdf_reader.getPage(0)
    print(page_obj.extractText())
예제 #12
0
def convert_pdf_to_jpeg(path: str,
                        max_pages: str,
                        password: str,
                        horizontal: bool = False):
    """
    Converts a PDF file into one or more combined JPEG images.

    The pages are rendered to JPEG files in a temporary directory, resized to a
    common size and stacked together in groups of at most PAGES_LIMITATION
    pages per output image.

    :param path: file's path
    :param max_pages: max pages to render; "*" renders every page, any other
        value is parsed with int() and capped at the PDF's page count
    :param password: PDF password, passed to the converter as ``userpw``
    :param horizontal: if True, will combine the pages horizontally, otherwise
        they are combined vertically
    :return: A list with the JPEG-encoded bytes of each combined image
    """
    demisto.debug(f'Loading file at Path: {path}')
    # Only the page count is needed from the reader, so the file handle is
    # closed right away instead of being left open.
    with open(path, "rb") as pdf_file:
        total_pages = len(PdfReader(pdf_file, strict=False).pages)
    pages = total_pages if max_pages == "*" else min(int(max_pages),
                                                     total_pages)

    with tempfile.TemporaryDirectory() as output_folder:
        demisto.debug('Converting PDF')
        convert_from_path(pdf_path=path,
                          fmt='jpeg',
                          first_page=1,
                          last_page=pages,
                          output_folder=output_folder,
                          userpw=password,
                          output_file='converted_pdf_')
        demisto.debug('Converting PDF - COMPLETED')

        demisto.debug('Combining all pages')
        images = []
        try:
            for page in sorted(os.listdir(output_folder)):
                page_path = os.path.join(output_folder, page)
                if os.path.isfile(page_path) and 'converted_pdf_' in page:
                    images.append(Image.open(page_path))

            # Target size: the (width, height) of the image whose
            # width + height sum is the smallest.
            min_shape = min([(np.sum(page_.size), page_.size)
                             for page_ in images])[1]

            # Divide the list of images into chunks of constant length
            # (PAGES_LIMITATION), due to the limitation of images in jpeg
            # format (max size ~65,000 pixels). Each chunk is combined into
            # one output image.
            images_matrix = [
                images[i:i + PAGES_LIMITATION]
                for i in range(0, len(images), PAGES_LIMITATION)
            ]

            stack = np.hstack if horizontal else np.vstack

            outputs = []
            for images_list in images_matrix:
                imgs_comb = stack([
                    np.asarray(image.resize(min_shape))
                    for image in images_list
                ])

                imgs_comb = Image.fromarray(imgs_comb)
                output = BytesIO()
                imgs_comb.save(output, 'JPEG')  # type: ignore
                demisto.debug('Combining all pages - COMPLETED')
                outputs.append(output.getvalue())

            return outputs
        finally:
            # Release the page image files before the temporary directory is
            # removed.
            for image in images:
                image.close()