Exemple #1
0
 def merge(self, pdf_one, pdf_two, filename='my.pdf', output_dir='D:/pdf/'):
     '''
     Interleave two scanned PDFs into a single document.

     ``pdf_one`` holds the scanned front sides and ``pdf_two`` the scanned
     back sides. Pages of ``pdf_one`` are taken first-to-last while pages of
     ``pdf_two`` are taken last-to-first, and the two are alternated in the
     output so the original page order is restored.

     :param pdf_one: path of the PDF containing the front sides
     :param pdf_two: path of the PDF containing the back sides
     :param filename: name of the merged file
     :param output_dir: prefix prepended verbatim to ``filename`` (it must
         therefore end with a path separator)
     :return: None
     '''
     # Both inputs must stay open until the writer has flushed the output,
     # because the page objects are read from the source streams lazily.
     with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
         pdf_input_one = PdfFileReader(input_one)
         pdf_input_two = PdfFileReader(input_two)
         numOne = pdf_input_one.getNumPages()
         numTwo = pdf_input_two.getNumPages()
         print(numOne, numTwo)
         pdf_output = PdfFileWriter()
         for index_one in range(numOne):
             index_two = numTwo - 1 - index_one
             print(index_one, index_two)
             pdf_output.addPage(pdf_input_one.getPage(index_one))
             # When the back scan has fewer pages than the front scan there
             # is no matching back side; a negative index would otherwise
             # wrap around and insert an unrelated page.
             if index_two >= 0:
                 pdf_output.addPage(pdf_input_two.getPage(index_two))
         pdf_name = output_dir + filename
         with open(pdf_name, 'wb') as output_stream:
             pdf_output.write(output_stream)
     print('Done!')
Exemple #2
0
def PdfPrettyPrint(inputname, outputname):
    """Rearrange the pages of a PDF in groups of four and write the result.

    For every group starting at page ``i`` the output receives, in order:
    page ``i``, page ``i + 2``, page ``i + 1`` rotated by 180 degrees and
    page ``i + 3`` rotated by 180 degrees. A slot whose source page does not
    exist is filled with a blank page.

    Args:
        inputname: path of the PDF to read.
        outputname: path of the PDF to create.

    Returns:
        True once the output file has been written.
    """
    # (offset inside the group, rotate by 180 degrees?) for the three slots
    # that follow the first page of each group.
    followers = ((2, False), (1, True), (3, True))

    # The input must stay open until the writer has produced the output.
    with open(inputname, 'rb') as inputfile:
        wrt = PdfFileWriter()
        ipt = PdfFileReader(inputfile)
        pdfnums = ipt.getNumPages()
        for i in range(0, pdfnums, 4):
            wrt.addPage(ipt.getPage(i))
            for offset, upside_down in followers:
                if i + offset < pdfnums:
                    page = ipt.getPage(i + offset)
                    if upside_down:
                        page.rotateClockwise(180)
                    wrt.addPage(page)
                else:
                    wrt.addBlankPage()
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return True
def generate_a_pdf(filename, num_pages, dir=None):
    """Generate a PDF file of N pages with a single image per page.

    Each page is drawn from a temporary JPEG produced by ``make_a_jpeg`` and
    ``pick_a_color`` (helpers defined elsewhere); the JPEG is deleted again
    once the page has been rendered.

    taken from https://stackoverflow.com/questions/2925484/place-image-over-pdf

    Args:
        filename (str): name of the pdf file to create
        num_pages (int): number of pages to make the pdf file

    KWArgs:
        dir (str): the path to the directory to save the pdf file; the
            current working directory is used when it is empty or None

    Returns:
        str. path to the new pdf file
    """
    pdf = PdfFileWriter()
    for num in range(1, num_pages + 1):
        imgTemp = BytesIO()
        jpeg_path = make_a_jpeg('{}.jpeg'.format(str(num)), pick_a_color(num))
        try:
            imgDoc = canvas.Canvas(imgTemp, pagesize=A4)
            imgDoc.drawImage(jpeg_path, 25, 45)
            imgDoc.save()
        finally:
            # The rendered page lives in imgTemp after save(); always drop
            # the temporary image, even when drawing failed.
            remove(jpeg_path)
        pdf.addPage(PdfFileReader(BytesIO(imgTemp.getvalue())).getPage(0))
    path = join(dir if dir else getcwd(), filename)
    # Close the output explicitly so the data is flushed before returning.
    with open(path, 'wb') as out_file:
        pdf.write(out_file)
    return path
Exemple #4
0
def generate_images(path, save_dir_name, is_train):
    """Render every page of a PDF to a PNG file.

    The images are written to
    ``png_files/train_images/<save_dir_name>_annotated_images`` when
    ``is_train`` is true, otherwise to
    ``png_files/test_images/<save_dir_name>_annotated_images``. Files are
    named ``<pdf basename>_<page number>.png`` with numbering starting at 1.

    Args:
        path: path of the PDF file to convert.
        save_dir_name: prefix of the output directory name.
        is_train: selects the train or test output tree.
    """
    subset_dir = 'train_images/' if is_train else 'test_images/'
    save_directory_path = 'png_files/' + subset_dir + save_dir_name + '_annotated_images'
    # Creates 'png_files/' and the subset directory as needed.
    os.makedirs(save_directory_path, exist_ok=True)

    filename = path
    print("Converting " + filename + " from pdf to PNG...")
    with open(filename, mode="rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        try:
            page_number = reader.getNumPages()
        except Exception:
            # The original author retries once, citing a PyPDF2 bug.
            page_number = reader.getNumPages()

    stem = os.path.splitext(os.path.basename(filename))[0]
    with tempfile.TemporaryDirectory() as work_dir:
        images_from_path = convert_from_path(filename, dpi=72, output_folder=work_dir,
                                             last_page=page_number, first_page=0)
        # Save while the temporary directory still exists: with
        # output_folder set, the returned images are backed by files in it.
        for i, page in enumerate(images_from_path, start=1):
            base_filename = stem + '_' + str(i) + '.png'
            page.save(os.path.join(save_directory_path, base_filename), 'PNG')

    print('PDF file successfully converted.')
Exemple #5
0
 def test_cat(self):
     """Run ``cat`` on the two fixture PDFs and expect a six-page output."""
     argv = ['cat', ONEPAGE_PDF, FIVEPAGE_PDF, self.outputfile]
     run_stapler(argv)
     self.assertTrue(os.path.isfile(self.outputfile))
     with open(self.outputfile, 'rb') as stream:
         page_total = PdfFileReader(stream).getNumPages()
         self.assertEqual(page_total, 6)
Exemple #6
0
    def translate(self):
        '''Read the PDF text, translate it line by line and write it out.

        Every non-empty line is passed to ``translate_func``; the original
        line and its translation (or a failure marker when the translation
        is empty) are emitted through ``self.write``. A running counter is
        printed as progress, and completion is reported through ``Logger``.
        '''
        index = 0
        # The context manager closes the PDF even if extraction or
        # translation raises part-way through.
        with open(self.fullPath, 'rb') as f:
            pdf = PdfFileReader(f)
            for i in range(pdf.getNumPages()):
                extractedText = pdf.getPage(i).extractText()
                content = self.removeBlankFromList(extractedText.split('\n'))

                # Per the original author's note: in the joined text, a gap
                # of more than one space between words is treated as a line
                # break.
                content_list = self.enter_symbol(content)

                for line in content_list:
                    line = line.strip()
                    if not line:
                        continue
                    ret = translate_func(line)
                    trans = ret if ret else '翻译失败'
                    self.write(line + '\n')
                    self.write(trans)
                    index += 1
                    print(index, end=' ', flush=True)

        Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
Exemple #7
0
def metadata(path: Path) -> Metadata:
    """
    Reads a given PDF file and produces a Metadata object.

    The title is the longest of: the title stored in the PDF metadata, the
    title detected by pdftitle, and the file name without extension.

    :param path: path to a PDF file
    :return: the metadata extracted from the PDF file
    """
    with path.open('rb') as f:
        reader = PdfFileReader(f)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()

    typer.echo(f'PDF metadata: {info}', err=True)

    # getDocumentInfo() returns None for a PDF without an info dictionary;
    # treat that as "no title, no author" instead of crashing.
    info_title = info.title if info is not None else None
    info_author = info.author if info is not None else None

    # Decide which possible title to use:
    # - the title annotated in the PDF metadata
    # - the title read by pdftitle (largest text on the first page)
    # - the file name without extension
    pdftitle_title = pdftitle.get_title_from_file(str(path))
    typer.echo(f'Title according to pdftitle: {pdftitle_title}', err=True)

    title_candidates = [t for t in [info_title, pdftitle_title, path.stem] if t is not None]

    # The current heuristic is just to use the longest of the three candidates
    title = max(title_candidates, key=len)

    return Metadata(
        title=title,
        author=info_author,
        page_count=page_count
    )
Exemple #8
0
    def finalize_print_preparation(self):
        """Split the multi page PDF into single pages rotated by 90 degrees.

        Each page of ``self._full_output_path_`` is rotated counter-clockwise
        and written to its own file
        ``<output base filename>_page_<NN>.pdf`` inside
        ``self._output_directory_name``. A summary is printed at the end.

        Taken from `pythonlibrary.org
        <https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs
        -with-python/>`_ in combination with `johndcook.com
        <https://www.johndcook.com/blog/2015/05/01/rotating-pdf-pages-with-python/>`_
        """

        reader: PdfFileReader = PdfFileReader(self._full_output_path_)
        for index in range(reader.getNumPages()):
            single_writer: PdfFileWriter = PdfFileWriter()
            rotated: PageObject = reader.getPage(index)
            rotated.rotateCounterClockwise(90)
            single_writer.addPage(rotated)

            # Two-digit, one-based page number used in the file name.
            page_label = str(index + 1).zfill(2)
            target_name: str = f"{self._output_base_filename}_page_{page_label}.pdf"
            target_path = os.path.join(self._output_directory_name, target_name)

            with open(target_path, "wb") as single_out:
                single_writer.write(single_out)

        combined_path = os.path.join(os.getcwd(), self._full_output_path_)
        print(f"Create {reader.getNumPages()} single paged PDFs.\n\n"
              f"You can find them concatenated at file://"
              f"{combined_path}")
Exemple #9
0
    def _create_pdf_from_rtf_files(self):
        """Convert every file in ``self.files`` to PDF and merge the results.

        Progress is reported through ``self.progress`` (0 first, then the
        number of files converted so far). For each converted PDF the chapter
        name is derived from the text of its first page; when that fails the
        file name (up to the first dot, underscores replaced by spaces) is
        used instead. Page counts and chapter names are stored in
        ``self.pages`` and ``self.chapters``. The merged document is written
        to ``self.master_file_name``, or to ``tmp.pdf`` when a table of
        contents is requested. The converted PDFs are appended to
        ``self.trash``.
        """
        pdfs = []
        self.progress.emit(0)
        for count, file in enumerate(self.files):
            pdfs.append(change_filetype(file, "pdf", self.engine))
            self.progress.emit(count + 1)

        merger = PdfFileMerger()
        pages = []
        chapters = []
        try:
            for file in pdfs:
                read_pdf = PdfFileReader(file)
                page_content = read_pdf.getPage(0).extractText()
                try:
                    chapter = helper_functions.get_chapter_from_pdf_txt(
                        page_content)
                except Exception:
                    # Fall back to a chapter name derived from the file
                    # name. A bare ``except`` would also swallow
                    # KeyboardInterrupt/SystemExit.
                    chapter = os.path.basename(file)
                    chapter = chapter.split(".")[0]
                    chapter = chapter.replace("_", " ")
                chapters.append(chapter)

                pages.append(read_pdf.getNumPages())
                merger.append(fileobj=file)
            self.pages = pages
            self.chapters = chapters
            if not self.create_toc:
                merger.write(self.master_file_name)
            else:
                merger.write("tmp.pdf")
        finally:
            # Release the merger's resources even when merging failed.
            merger.close()
        self.trash += pdfs
Exemple #10
0
 def test_zip(self):
     """Run ``zip`` on the two fixture PDFs and expect a six-page output."""
     argv = ['zip', ONEPAGE_PDF, FIVEPAGE_PDF, self.outputfile]
     run_stapler(argv)
     self.assertTrue(os.path.isfile(self.outputfile))
     with open(self.outputfile, 'rb') as stream:
         page_total = PdfFileReader(stream).getNumPages()
         self.assertEqual(page_total, 6)
Exemple #11
0
 def test_sel_range(self):
     """Select pages 2-4 of the five-page fixture and expect three pages."""
     argv = ['cat', 'A=' + FIVEPAGE_PDF, 'A2-4', self.outputfile]
     run_stapler(argv)
     self.assertTrue(os.path.isfile(self.outputfile))
     with open(self.outputfile, 'rb') as stream:
         page_total = PdfFileReader(stream).getNumPages()
         self.assertEqual(page_total, 3)
Exemple #12
0
 def test_del_range(self):
     """Delete pages 2-4 of the five-page fixture and expect two pages."""
     argv = ['del', 'A=' + FIVEPAGE_PDF, 'A2-4', self.outputfile]
     run_stapler(argv)
     self.assertTrue(os.path.isfile(self.outputfile))
     with open(self.outputfile, 'rb') as stream:
         page_total = PdfFileReader(stream).getNumPages()
         self.assertEqual(page_total, 2)
Exemple #13
0
def extract_from_file(file: IO[bytes], filename: str, mime_type: str,
                      file_id: int) -> Tuple[Optional[str], Optional[int]]:
    """Return the text and the page count of an uploaded file.

    For PDFs (``application/pdf`` with or without parameters) the text is
    obtained by running ``pdftotext`` on ``filename`` and the page count by
    parsing ``file`` with PyPDF2; either part is ``None`` when it cannot be
    determined, and the failure is logged. For ``text/text`` the content of
    ``file`` is returned as text and the page count is ``None``. Any other
    mime type is logged and yields ``(None, None)``.

    :param file: open file object with the document content
    :param filename: path of the document on disk (passed to ``pdftotext``)
    :param mime_type: mime type of the document
    :param file_id: identifier used in log messages
    :return: tuple ``(text, page_count)``
    """

    parsed_text = None
    page_count = None
    if mime_type == "application/pdf" or mime_type.startswith(
            "application/pdf;"):
        command = ["pdftotext", filename, "-"]
        try:
            completed = subprocess.run(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       check=True)
        except (CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable pdftotext binary,
            # which would otherwise abort the whole extraction.
            logger.exception("File {}: Failed to run pdftotext: {}".format(
                file_id, e))
        else:
            parsed_text = completed.stdout.decode("utf-8", "ignore")
            if completed.stderr:
                logger.info("pdftotext: {}".format(completed.stderr))

        try:
            page_count = PdfFileReader(file,
                                       strict=False,
                                       overwriteWarnings=False).getNumPages()
        except (PdfReadError, KeyError):
            message = "File {}: Pdf does not allow to read the number of pages".format(
                file_id)
            logger.warning(message)
    elif mime_type == "text/text":
        raw = file.read()
        # The file is opened in binary mode per the signature, but the
        # function promises text: decode the same way as the pdftotext output.
        parsed_text = raw.decode("utf-8", "ignore") if isinstance(
            raw, bytes) else raw
    else:
        logger.warning(
            f"File {file_id} has an unknown mime type: '{mime_type}'")
    return parsed_text, page_count
Exemple #14
0
def extract_pdf_pypdf2(pdf_path, page_index=2):
    """Return the text PyPDF2 extracts from one page of a PDF.

    An encrypted document is first decrypted with an empty password.

    Args:
        pdf_path: path of the PDF file to read.
        page_index: zero-based index of the page to extract. Defaults to 2
            (the third page), the page the function always used before the
            parameter existed.

    Returns:
        The extracted text of the selected page.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            pdf.decrypt('')
        return pdf.getPage(page_index).extractText()
def RemovePdfOwnerPassword(inputname, outputname):
    '''Copy an encrypted PDF into a new file after decrypting it.

    The input is decrypted with an empty password and every page is copied
    into a fresh writer, which is then saved as ``outputname``. The
    document info of the input is printed along the way.

    :param inputname: path of the encrypted PDF
    :param outputname: path of the PDF to create
    :return: 0 on success, -1 when the input is not encrypted
    '''
    # The input must stay open until the writer has produced the output; the
    # context manager also closes it on the early return and on errors.
    with open(inputname, 'rb') as inputfile:
        wrt = PdfFileWriter()
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # Exceptions have no ``.message`` attribute on Python 3; the
            # missing key is available as the first argument.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
    def run(self):
        """Download the attachment, drop its last page and report the result.

        Nothing happens when ``self.beforeHandler`` returns a truthy value.
        Otherwise the file at ``self.attachUrl`` is downloaded to a temporary
        name under ``self.tempDir``, all pages except the last are copied
        into a second temporary PDF, and ``self.success`` (if set) is called
        with the id and the path of that PDF. Any exception is forwarded to
        ``self.error`` (if set). Both temporary files are deleted before the
        method returns, so the success callback must consume the file
        synchronously.
        """
        if self.beforeHandler(self._id, self.attachUrl):
            return
        filename = self.tempDir + str(random.random())
        filename1 = self.tempDir + str(random.random()) + '.pdf'
        try:
            urllib.request.urlretrieve(self.attachUrl, filename)
            # Context managers guarantee both streams are closed before the
            # cleanup below runs, even when reading or writing fails.
            with open(filename, 'rb') as input_stream:
                pdf_input = PdfFileReader(input_stream)
                pdf_output = PdfFileWriter()

                # remove last page
                for page in range(pdf_input.getNumPages() - 1):
                    pdf_output.addPage(pdf_input.getPage(page))

                with open(filename1, 'wb') as output_stream:
                    pdf_output.write(output_stream)
            if self.success is not None:
                self.success(self._id, filename1)
        except Exception as e:
            if self.error is not None:
                self.error(e, self.attachUrl)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
            if os.path.exists(filename1):
                os.remove(filename1)
Exemple #17
0
def readPDFfile(infile):
    """Return the text of every page of a PDF, one page per line block.

    Args:
        infile: path of the PDF file to read.

    Returns:
        The extracted text of all pages, each followed by a newline.
    """
    # The original line had an unbalanced parenthesis and was missing the
    # open() call; open the file here and close it once extraction is done.
    with open(infile, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages()))
def getDataUsingPyPdf2(filename):
    """Return the text of every page of a PDF, each followed by a newline.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The concatenated text PyPDF2 extracts from all pages.
    """
    # Extraction happens inside the context manager so the file is closed
    # afterwards instead of being left to the garbage collector.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages()))
Exemple #19
0
def _write_bookmark_entries(entries, depth, page_labels, out):
    """Write one line per Destination in *entries*, recursing into sub-lists.

    Each line is ``depth`` tabs, the bookmark title, a tab, the page number
    and CRLF. A nested list is written one level deeper.
    """
    for entry in entries:
        if isinstance(entry, list):
            _write_bookmark_entries(entry, depth + 1, page_labels, out)
        elif isinstance(entry, PyPDF2.generic.Destination):
            out.write('\t' * depth + entry['/Title'] + '\t' +
                      str(page_labels[entry.page.idnum]) + '\r\n')


def getPdffileBookmark(filename, bookmark_file_savepath):
    """Export the outline (bookmarks) of a PDF to a tab-indented text file.

    Every bookmark becomes one UTF-8 line ``<tabs><title>\\t<page>\\r\\n``
    where the number of leading tabs is the nesting level and the page is
    the 1-based page number the bookmark points to. The page count, the raw
    outline and every top-level outline item are printed for inspection.

    Args:
        filename: path of the PDF file to read.
        bookmark_file_savepath: path of the text file to create.
    """
    with open(filename, "rb") as pdf_stream:
        pdf = PdfFileReader(pdf_stream)

        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)

        # Maps the object id of each page (the value a bookmark's
        # ``page.idnum`` refers to) to its 1-based page number.
        page_labels = {
            pdf.getPage(i).indirectRef.idnum: i + 1
            for i in range(pagecount)
        }

        outlines = pdf.getOutlines()
        print(outlines)

        with codecs.open(bookmark_file_savepath, 'w',
                         encoding='utf-8') as bookmark_file:
            for outline in outlines:
                print(len(outline), outline)
                # Wrapping the item lets the helper treat a top-level
                # Destination (level 0) and a top-level list (level 1)
                # uniformly. The former innermost loop tested the enclosing
                # list instead of its items, so level-2 bookmarks were never
                # written.
                _write_bookmark_entries([outline], 0, page_labels,
                                        bookmark_file)
Exemple #20
0
 def convertPDFAlternative(self, path):
     """Extract the text of every page of a PDF into ``self.pages``.

     Each page index is printed as it is processed and the extracted text is
     appended to ``self.pages``.

     :param path: path of the PDF file to read
     :return: False when ``path`` does not exist, True otherwise
     """
     if not os.path.exists(path):
         return False
     # Imported only once there is a file to read.
     from PyPDF2.pdf import PdfFileReader
     # The context manager closes the file once all pages are extracted.
     with open(path, "rb") as stream:
         pdf = PdfFileReader(stream)
         for i in range(0, pdf.getNumPages()):
             print(i)
             self.pages.append(pdf.getPage(i).extractText())
     return True
def merge_pdf(file_list, output_path):
    '''Merge several PDF files into one.

    The pages of every file in ``file_list`` are appended, in order, to a
    new document that is written to ``output_path``.

    :param file_list: iterable of paths of the PDFs to merge
    :param output_path: path of the merged PDF to create
    '''
    from contextlib import ExitStack

    outpdf = PdfFileWriter()
    # Every input has to stay open until the writer has produced the output;
    # the ExitStack then closes all of them, also when an error occurs.
    with ExitStack() as stack:
        for f in file_list:
            f_pdf = PdfFileReader(stack.enter_context(open(f, 'rb')))
            for page in f_pdf.pages:
                outpdf.addPage(page)
        with open(output_path, 'wb') as ous:
            outpdf.write(ous)
Exemple #22
0
    def test_split(self):
        """Split the five-page fixture and expect five one-page files."""
        run_stapler(['split', FIVEPAGE_PDF])

        produced = os.listdir(self.tmpdir)
        self.assertEqual(len(produced), 5)
        for name in os.listdir(self.tmpdir):
            part_path = os.path.join(self.tmpdir, name)
            with open(part_path, 'rb') as stream:
                page_total = PdfFileReader(stream).getNumPages()
                self.assertEqual(page_total, 1)
Exemple #23
0
 def get(self, request, *args, **kwargs):
     """Return the template PDF with a generated overlay as a download.

     An overlay page is drawn with reportlab by ``self.motion_purpose_draw``
     using the ``welfare_equipment`` material and a fixed list of rows, then
     merged onto the first page of ``media/pdf/riyuu-format4.pdf``. The
     result is served as the attachment ``hello.pdf``.
     """
     font_name = "HeiseiMin-W3"
     pdfmetrics.registerFont(UnicodeCIDFont(font_name))

     template_reader = PdfFileReader('media/pdf/riyuu-format4.pdf')
     pdf_writer = PdfFileWriter()
     overlay_buffer = io.BytesIO()
     overlay = canvas.Canvas(overlay_buffer)
     overlay.setFont(font_name, 11)

     # Layout constants handed to motion_purpose_draw unchanged.
     initial, line_height = 295, 11.9
     before_rect_x, after_rect_x = 748, 776.5

     # (label, before_flag, after_flag) for each row of the overlay.
     rows = (
         ('便器からの立ち座り', True, False),
         ('トイレまでの移動', False, True),
         ('トイレ出入口の出入(扉の開閉含む)', True, False),
     )
     input_list = [
         {'label': label, 'before_flag': before, 'after_flag': after}
         for label, before, after in rows
     ]

     material = PdfMaterial.objects.get(key="welfare_equipment")
     overlay = self.motion_purpose_draw(overlay, before_rect_x, after_rect_x,
                                        material.materials, input_list,
                                        initial, line_height)
     overlay.showPage()
     overlay.save()
     overlay_buffer.seek(0)

     # Stamp the overlay onto the first page of the template.
     overlay_page = PdfFileReader(overlay_buffer).getPage(0)
     base_page = template_reader.getPage(0)
     base_page.mergePage(overlay_page)
     pdf_writer.addPage(base_page)

     response_stream = io.BytesIO()
     pdf_writer.write(response_stream)
     response_stream.seek(0)
     print('finish')
     return FileResponse(response_stream, as_attachment=True,
                         filename='hello.pdf')
Exemple #24
0
def getDataUsingPyPdf2(filename):
    """Return the whitespace-normalised ASCII text of a PDF.

    The text of all pages is concatenated, non-breaking spaces are turned
    into regular spaces, every run of whitespace is collapsed to a single
    space and characters outside ASCII are dropped.

    Args:
        filename: path of the PDF file to read.

    Returns:
        bytes: the normalised text encoded as ASCII.
    """
    # Extraction happens inside the context manager so the file is closed
    # afterwards instead of being left to the garbage collector.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        content = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages()))

    content = " ".join(content.replace("\xa0", " ").strip().split())
    return content.encode("ascii", "ignore")
Exemple #25
0
 def _extract_pdf_forms(self, fname):
     """Read the interactive form fields of a PDF file.

     Parameters:
         fname (str): Path to PDF file.

     Returns:
         The value of ``PdfFileReader.getFields()`` for that file.
     """
     return PdfFileReader(fname).getFields()
Exemple #26
0
 def get(self, request, *args, **kwargs):
     """Return the sample PDF with a text overlay as a download.

     A single-page overlay containing one string is drawn with reportlab and
     merged onto the first page of ``media/pdf/sample.pdf``; the result is
     served as the attachment ``hello.pdf``.
     """
     font_name = "HeiseiKakuGo-W5"
     pdfmetrics.registerFont(UnicodeCIDFont(font_name))

     overlay_buffer = io.BytesIO()
     overlay = canvas.Canvas(overlay_buffer)
     base_page = PdfFileReader('media/pdf/sample.pdf').getPage(0)

     # Draw the overlay text and finish the overlay document.
     overlay.setFont(font_name, 24)
     overlay.drawString(0, 820, "テスト")
     overlay.showPage()
     overlay.save()
     overlay_buffer.seek(0)

     # Stamp the overlay onto the first page of the sample.
     overlay_page = PdfFileReader(overlay_buffer).getPage(0)
     base_page.mergePage(overlay_page)
     pdf_writer = PdfFileWriter()
     pdf_writer.addPage(base_page)

     response_stream = io.BytesIO()
     pdf_writer.write(response_stream)
     response_stream.seek(0)
     return FileResponse(response_stream, as_attachment=True,
                         filename='hello.pdf')
Exemple #27
0
    def test_render_to_pdf_bytes(self):
        """Render the ``complex_fields`` document to PDF and parse the bytes."""
        context = {
            'complex': 'BEEES. AAAAH. BEEEEES',
            'complex2': 'ಠ_ಠ unifying matrix conventions is the way of the future, Max.',
        }
        document = self._get_docx('complex_fields')
        rendered = document.render(context=context, format='pdf')

        # Constructing a reader is the check: it raises if the output is not
        # something PyPDF can parse.
        PdfFileReader(BytesIO(rendered))
Exemple #28
0
def pdfSplit(pdf_main, pdf_part):
    """Write the last page of a PDF, rotated by 90 degrees, to a new file.

    Args:
        pdf_main: path of the PDF to read.
        pdf_part: path of the single-page PDF to create.

    Returns:
        The number of pages of ``pdf_main`` minus one on success, or False
        when anything goes wrong. Note that a one-page input yields 0, which
        compares equal to False; callers that need to tell the two apart
        must test with ``is False``.
    """
    try:
        pdf_read_obj = PdfFileReader(pdf_main)
        pdf_write_obj = PdfFileWriter()
        page_num = pdf_read_obj.getNumPages()
        page_last_obj = pdf_read_obj.getPage(page_num - 1)
        page_last_obj.rotateClockwise(90)
        pdf_write_obj.addPage(page_last_obj)
        # Close the output explicitly so the data is flushed before returning.
        with open(pdf_part, 'wb') as part_file:
            pdf_write_obj.write(part_file)
        return page_num - 1
    except Exception:
        # Deliberate best-effort contract: every failure is reported as False.
        return False
Exemple #29
0
def process_files(files):
    """Merge PDFs that share a title into the first file seen for it.

    The title of each file is taken from ``util.parse_file`` applied to its
    base name. The first file of a title is kept as is; every later file
    with the same title is appended to that first file on disk and then
    deleted.

    Args:
        files: iterable of PDF file paths.

    Returns:
        dict mapping each title to the path of the file that holds it.
    """
    title_map = {}
    for file in files:
        # parse filename
        title, _, _ = util.parse_file(os.path.basename(file))

        # if not already seen, add this file to the map
        if title not in title_map:
            title_map[title] = file
            continue

        # we need to merge pdfs into 1
        merger = PdfFileMerger()
        try:
            with open(title_map[title], "rb") as f:
                merger.append(PdfFileReader(f))
            with open(file, "rb") as f:
                merger.append(PdfFileReader(f))
            merger.write(title_map[title])
        finally:
            # Release the merger's resources for every pair, also on error.
            merger.close()
        os.remove(file)

    return title_map
 def set_bglj(self, filename):
     """Record report details for ``filename`` in ``self.user_data``.

     ``bgms`` is set to ``'1'`` for a name ending in ``.pdf`` and to ``'0'``
     otherwise. For a PDF, ``bgym`` receives its page count, or 0 (with a
     printed error message) when the count cannot be read. ``bglj`` is
     always set to ``self.cur_dir``.
     """
     is_pdf = filename.endswith('.pdf')
     self.user_data['bgms'] = '1' if is_pdf else '0'
     if is_pdf:
         # Get the PDF page count
         try:
             self.user_data['bgym'] = PdfFileReader(filename).getNumPages()
         except Exception as e:
             self.user_data['bgym'] = 0
             print("获取报告页码出错,错误信息:%s" % e)
     self.user_data['bglj'] = self.cur_dir