コード例 #1
2
    def pdf(self):
        """Build a ``PdfFileReader`` over ``self.data``.

        When the first parse fails, the data is passed through
        ``self._fixPdf`` and parsed once more. An encrypted document is
        tried against the empty password. When no reader could be created
        at all, ``remove_image_previews`` is called for ``self.context``.

        Returns the reader, or ``None`` when the data could not be parsed.
        """
        reader = None
        try:
            reader = PdfFileReader(StringIO(self.data))
        except Exception:
            logger.warn('Error opening pdf file, trying to fix it...')
            repaired = self._fixPdf(self.data)

            # second and last attempt, this time on the repaired data
            try:
                reader = PdfFileReader(StringIO(repaired))
            except Exception:
                logger.warn('This pdf file cannot be fixed.')

        if reader is None:
            remove_image_previews(self.context)
            return reader

        if reader and reader.isEncrypted:
            try:
                # decrypt() answers 0 when the password did not match
                if reader.decrypt('') == 0:
                    logger.warn('This pdf is password protected.')
            except Exception:
                logger.warn('Errors while decrypting the pdf file.')

        return reader
コード例 #2
0
def read_or_save():
    """Decrypt the selected PDF and either save a copy or print its text.

    The mode, input path, output path and password are read from the
    module-level widgets. In "Save" mode every page is copied into a new
    file; in any other mode the extracted text of each page is printed.
    """
    mode = read_save_dropdown.get()
    source_path = read_pdf_input.get()
    target_path = save_to_file_input.get()
    password = read_pass_input.get()
    canvas.update()

    if mode != "Save":
        start_read_button.config(text=f"{mode}ing pdf. Check Terminal!")

        with open(source_path, 'rb') as source:
            reader = PdfFileReader(source)
            reader.decrypt(password)
            for index in range(reader.getNumPages()):
                page_content = reader.getPage(index).extractText()
                print(
                    f"-----------------------------------\nOUTPUT:\n\n{page_content}"
                    f"\n-----------------------------------")
        return

    start_read_button.config(text=f"{mode}ing file!")

    with open(source_path, 'rb') as source:
        with open(target_path, 'wb') as target:
            reader = PdfFileReader(source)
            reader.decrypt(password)
            writer = PdfFileWriter()

            for index in range(reader.getNumPages()):
                writer.addPage(reader.getPage(index))
            writer.write(target)
コード例 #3
0
def merge_pdf(filepath, outfilename='.merge.pdf'):
    """Merge every PDF listed for ``filepath`` into a single file.

    The output is written next to ``filepath`` and named after it:
    ``<dirname(filepath)>/<basename(filepath)><outfilename>``.

    :param filepath: location handed to ``get_filename`` to list the PDFs
    :param outfilename: suffix appended to the base name of ``filepath``
    :return: full path of the merged PDF
    """
    filepath_2nd = os.path.dirname(filepath)
    filename = os.path.basename(filepath) + outfilename
    outfile = os.path.join(filepath_2nd, filename)

    pdfwriter = PdfFileWriter()
    outputPages = 0
    pdf_files = get_filename(filepath, filetypes=['.pdf'])

    # The source streams have to stay open until the writer has produced
    # the output, so they are collected here and closed together at the end.
    opened = []
    try:
        for _file in pdf_files:
            stream = open(_file, 'rb')
            opened.append(stream)
            pdreader = PdfFileReader(stream)
            if pdreader.isEncrypted:
                # an encrypted PDF has to be decrypted before it can be read
                pdreader.decrypt("map")

            pageCount = pdreader.getNumPages()  # pages in this source file
            outputPages += pageCount
            print(_file, pageCount)
            # append every page of this file to the output
            for iPage in range(pageCount):
                pdfwriter.addPage(pdreader.getPage(iPage))
        print("All Pages Number:" + str(outputPages))

        with open(outfile, "wb") as f:
            pdfwriter.write(f)
    finally:
        for stream in opened:
            stream.close()
    print('outfile=%s' % outfile)
    return outfile
コード例 #4
0
ファイル: handlers.py プロジェクト: lnutimura/camelot
    def _save_pages(self, filepath, pages, temp):
        """Saves the specified pages from a PDF into a temporary directory.

        Each page is written to its own single-page file named
        ``page-<number>.pdf`` inside ``temp``.

        Parameters
        ----------
        filepath : str
            Path of the PDF file; it is opened with ``open``.
        pages : iterable of int
            1-based page numbers to extract.
        temp : str
            Tmp directory.

        """
        with open(filepath, "rb") as fileobj:
            infile_original = PdfFileReader(fileobj, strict=False)
            if infile_original.isEncrypted:
                infile_original.decrypt(self.password)

            for page in pages:
                # Ensure PdfFileReader object is unmodified
                infile = copy.copy(infile_original)
                fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
                # NOTE(review): froot/fext are not used in the visible part
                # of this method
                froot, fext = os.path.splitext(fpath)
                # page numbers are 1-based, getPage() takes a 0-based index
                p = infile.getPage(page - 1)
                outfile = PdfFileWriter()
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

                # Orient rotated pages correctly
                """
コード例 #5
0
ファイル: parser.py プロジェクト: ekulos/eea.converter
    def _parsepdf(self, pdf, password='', **kwargs):
        """ parses the given pdf file and returns a mapping of attributes

        The mapping is filled from the XMP metadata of the document, then
        from its custom properties, and finally a guessed language is added
        when the metadata did not provide one.

        :param pdf: passed to ``PdfFileReader`` and to ``self._guessLanguage``
        :param password: used to decrypt the document unless it is ``""``
        :return: dict of metadata
        """
        # (XMP attribute, output key) pairs, applied in this order: when
        # several attributes feed the same key ('uuid') the last non-empty
        # one wins.
        xmp_fields = (
            ('pdf_keywords', 'keywords'),
            ('dc_language', 'language'),
            ('dc_identifier', 'uuid'),
            ('xmpmm_documentId', 'uuid'),
            ('xmpmm_instanceId', 'uuid'),
            ('xmp_createDate', 'creationdate'),
            ('xmp_modifyDate', 'modificationdate'),
            ('xmp_metadataDate', 'metadatadate'),
            ('dc_rights', 'rights webstatement'),
            ('pdf_producer', 'producer'),
            ('xmp_creatorTool', 'creatortool'),
            ('dc_title', 'title'),
            ('dc_description', 'description'),
            ('dc_rights', 'rights'),
            ('dc_format', 'format'),
            ('dc_creator', 'creator'),
        )

        # This will store the parsed metadata
        META_MAP = {}

        opdf = PdfFileReader(pdf)

        if password != "":
            opdf.decrypt(password)

        metadata = opdf.getXmpMetadata()

        for attribute, key in xmp_fields:
            value = getattr(metadata, attribute, None)
            if value:
                META_MAP[key] = value

        custom_properties = getattr(metadata, 'custom_properties', None)
        if custom_properties:
            META_MAP.update(custom_properties)

        l = self._guessLanguage(pdf)
        # `in` instead of dict.has_key(): has_key does not exist on Python 3
        if l and 'language' not in META_MAP:
            META_MAP['language'] = l

        # Finally we'll do some plone specific rewritings
        # It would be smart to hook some kind of adapter
        # here so that one can define his own rewritings
        if 'keywords' in META_MAP:
            META_MAP['subject_keywords'] = list(META_MAP['keywords'])

        return META_MAP
コード例 #6
0
def convert_coords(pdf_name: str, detected_obj: list):
    """Translate a detected box from image pixels to PDF units.

    The y values are flipped against the module-level ``IMG_HEIGHT`` and
    all four values are scaled by ``page height / IMG_HEIGHT``, where the
    page height is taken from the media box of the first page.

    :param pdf_name: path of the PDF the detection belongs to
    :param detected_obj: ``[x1, y1, x2, y2]`` in image pixels; every element
        must be convertible with ``int``
    :return: the four converted values as one ``", "``-separated string
    """
    with open(pdf_name, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        if reader.isEncrypted:
            reader.decrypt('')
        # fetch the value while the stream is still open
        pdf_height = reader.getPage(0).mediaBox[3]

    # Taking out coords, and translating them for camelot
    coords = [int(coord) for coord in detected_obj]
    coords_img = [
        coords[0],
        IMG_HEIGHT - coords[1],
        coords[2],
        IMG_HEIGHT - coords[3],
    ]

    pdf_img_ratio = pdf_height / IMG_HEIGHT
    coords_pdf = [float(pdf_img_ratio * value) for value in coords_img]

    # str() of the list without its surrounding brackets
    return str(coords_pdf)[1:-1]
コード例 #7
0
ファイル: utils.py プロジェクト: raincoldz/pytool
def deletePDF(input_dirPath, output_dirPath, delete_page):
    '''
    Write a copy of a PDF that leaves out the given pages.

    :param input_dirPath: path of the source PDF
    :param output_dirPath: path of the PDF to create
    :param delete_page: page specification, turned into the collection of
        1-based page numbers to drop by ``getDeleteInterval``
    '''
    output = PdfFileWriter()
    deleteInterval = getDeleteInterval(delete_page)
    print(deleteInterval)

    # The source has to stay open until the output has been written.
    with open(input_dirPath, "rb") as source_stream:
        source = PdfFileReader(source_stream)

        # an encrypted PDF has to be decrypted before it can be read
        if source.isEncrypted:
            source.decrypt("map")

        pageCount = source.getNumPages()
        print(pageCount)

        # Count what is really kept instead of assuming that every entry
        # of deleteInterval names a distinct, existing page.
        outputPages = 0
        for iPage in range(1, pageCount + 1):
            if iPage not in deleteInterval:
                output.addPage(source.getPage(iPage - 1))
                outputPages += 1

        print("All Pages Number:" + str(outputPages))
        with open(output_dirPath, "wb") as outputStream:
            output.write(outputStream)
    print("finished")
コード例 #8
0
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)`` into one file.

    Every source file gets a bookmark pointing at its first page in the
    merged document; the bookmark title is the file path without its
    extension. The result is written to ``filepath + outfile``.

    :param filepath: location handed to ``getFileName``; also the prefix of
        the output path
    :param outfile: appended to ``filepath`` to form the output path
    """
    import os  # local import: only needed for the bookmark title

    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)

    # The source streams have to stay open until the writer has produced
    # the output, so they are collected here and closed together at the end.
    opened = []
    try:
        for each_file in pdf_fileName:
            print("adding %s" % each_file)
            stream = open(each_file, "rb")
            opened.append(stream)
            source = PdfFileReader(stream)

            # an encrypted PDF has to be decrypted before it can be read
            if source.isEncrypted:
                source.decrypt("map")

            pageCount = source.getNumPages()
            outputPages += pageCount
            print("%s has %d pages" % (each_file, pageCount))

            for iPage in range(pageCount):
                output.addPage(source.getPage(iPage))

            # A file without pages has no first page to point a bookmark at.
            if pageCount:
                output.addBookmark(title=os.path.splitext(each_file)[0],
                                   pagenum=outputPages - pageCount)

        print("All Pages Number: " + str(outputPages))
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()
    print("finished")
コード例 #9
0
def merge_pdf(path: str, output_filename: str, bookmark_separator: str = "", bookmark_start_index: int = 1,
              password: str = "") -> None:
    """
    Merge all PDFs of one folder into a single file.

    An existing ``output_filename`` is removed first, and the permissions
    of ``path`` are set to ``stat.S_IRWXU``.

    :param str path: folder path
    :param str output_filename: output file name (including its path)
    :param str bookmark_separator: bookmark title prefix used for every merged pdf;
        when empty, each bookmark is named after its file
    :param int bookmark_start_index: first number of the bookmark suffix
    :param str password: password used when a pdf is encrypted
    """
    if os.path.exists(output_filename):
        os.remove(output_filename)
    os.chmod(path, stat.S_IRWXU)  # ensure we have permission
    output_pdf = PdfFileMerger()
    output_page_num = 0
    for index, pdf_path_with_name in enumerate(get_pdf_names(path), bookmark_start_index):
        print(pdf_path_with_name)
        with open(pdf_path_with_name, "rb") as pdf:
            content = PdfFileReader(pdf)
            if content.isEncrypted:
                content.decrypt(password)
            # one bookmark at the first page of each merged pdf
            if bookmark_separator:
                title = bookmark_separator + str(index)
            else:
                # Normalise the separators first so that the directory part
                # is dropped for "/" paths as well as for "\\" paths.
                file_name = os.path.basename(pdf_path_with_name.replace("\\", "/"))
                title = file_name.split(".")[0]
            output_pdf.addBookmark(title, output_page_num)
            output_pdf.append(content)
            output_page_num += content.numPages

    with open(output_filename, "wb") as f:
        output_pdf.write(f)
    print("mission complete")
コード例 #10
0
def MergePDF(filepath, outfile):
    """
    Merge the pdf files of a folder into a single file.

    :param filepath: location handed to ``getFileName`` to list the pdfs
    :param outfile: path of the merged pdf
    :return: None
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath, '.pdf')

    # The source streams have to stay open until the writer has produced
    # the output, so they are collected here and closed together at the end.
    opened = []
    try:
        for each in pdf_fileName:
            stream = open(each, 'rb')
            opened.append(stream)
            source = PdfFileReader(stream)

            if source.isEncrypted:
                source.decrypt('map')

            pageCount = source.getNumPages()
            outputPages += pageCount
            for iPage in range(pageCount):
                output.addPage(source.getPage(iPage))

        with open(outfile, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()
    print('save:' + outfile + ' finished!')
コード例 #11
0
ファイル: pdflib.py プロジェクト: imze5z/python_study
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    :param pdf_file_in: path of the PDF to watermark
    :param pdf_file_mark: path of the PDF whose first page is the watermark
    :param pdf_file_out: path the result is written to
    :return: ``False`` when the input is encrypted and cannot be opened
        with the empty password, ``None`` otherwise
    """
    pdf_output = PdfFileWriter()
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # the PDF is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # try the empty password; decrypt() reports a wrong password by
            # returning 0, so the result is checked as well as exceptions
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # number of pages of the input
        page_num = pdf_input.getNumPages()
        # read the watermark pdf
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)
            # watermark every page
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)
            # both inputs are still open while the output is produced
            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
コード例 #12
0
ファイル: pdf.py プロジェクト: uees/happyWork
def add_watermark(pdf_file_mark, pdf_file_in, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    :param pdf_file_mark: path of the PDF whose first page is the watermark
    :param pdf_file_in: path of the PDF to watermark
    :param pdf_file_out: path the result is written to
    :return: ``False`` when the input is encrypted and cannot be opened
        with the empty password, ``None`` otherwise
    """
    with open(pdf_file_in, 'rb') as fp:
        pdf_input = PdfFileReader(fp)

        # the PDF is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # try the empty password; decrypt() reports a wrong password by
            # returning 0, so the result is checked as well as exceptions
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # number of pages of the input
        pageNum = pdf_input.getNumPages()

        with open(pdf_file_mark, 'rb') as mfp:
            pdf_output = PdfFileWriter()
            # read the watermark pdf
            pdf_watermark = PdfFileReader(mfp)

            # watermark every page
            for i in range(pageNum):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)

            with open(pdf_file_out, 'wb') as wfp:
                pdf_output.write(wfp)
コード例 #13
0
ファイル: JoinPDF.py プロジェクト: dnmczfh/python3
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)`` into one file.

    The result is written to ``filepath + outfile``.

    :param filepath: location handed to ``getFileName``; also the prefix of
        the output path
    :param outfile: appended to ``filepath`` to form the output path
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)
    for each in pdf_fileName:
        print(each)
        # read the source pdf
        source = PdfFileReader(each)

        # an encrypted PDF has to be decrypted before it can be read
        if source.isEncrypted:
            source.decrypt("map")

        # number of pages of this source file
        pageCount = source.getNumPages()
        outputPages += pageCount
        print(pageCount)

        # append every page of this file to the output
        for iPage in range(pageCount):
            output.addPage(source.getPage(iPage))

    print("All Pages Number:" + str(outputPages))
    # open() instead of file(): the file() builtin was removed in Python 3
    with open(filepath + outfile, "wb") as outputStream:
        output.write(outputStream)
    print("finished")
コード例 #14
0
    def convert2img(self, path_to_save_img):
        """Convert the PDF at ``self.filepath`` to images via ``convrt_img``.

        ``path_to_save_img`` and a sub-folder named after the file are
        created when missing. An encrypted PDF is first decrypted with
        ``self.password`` into a temporary copy inside that sub-folder; the
        copy is converted and then removed again.

        :param path_to_save_img: folder handed to ``convrt_img`` as target
        """
        file_name = get_file_name_without_extension(self.filepath)

        if not os.path.exists(path_to_save_img):
            os.makedirs(path_to_save_img)

        if not os.path.exists(f'{path_to_save_img}/{file_name}'):
            os.makedirs(f'{path_to_save_img}/{file_name}')

        with open(self.filepath, 'rb') as file:
            reader = PdfFileReader(file)

            if reader.isEncrypted:
                reader.decrypt(self.password)

                base_file_name = get_file_name(self.filepath)
                temp_file_loc = os.path.join(path_to_save_img, file_name, base_file_name)

                try:
                    with open(temp_file_loc, 'wb') as pdf_file:
                        writer = PdfFileWriter()
                        for page in range(reader.getNumPages()):
                            writer.addPage(reader.getPage(page))
                        writer.write(pdf_file)

                    convrt_img(temp_file_loc, path_to_save_img)
                finally:
                    # never leave the decrypted copy behind, even when
                    # writing or converting it failed
                    if os.path.exists(temp_file_loc):
                        os.remove(temp_file_loc)

            else:
                convrt_img(self.filepath, path_to_save_img)
コード例 #15
0
ファイル: iocp.py プロジェクト: walt1998/ioc_parser
    def parse_pdf_pypdf2(self, f, fpath):
        """Extract the text of a PDF page by page and collect its IOCs.

        Every page's text is handed to ``self.parse_page``. Errors other
        than KeyboardInterrupt/SystemExit are reported through
        ``self.handler.print_error`` and end the parsing; whatever was
        gathered up to that point is still returned.

        :param f: passed to ``PdfFileReader``
        :param fpath: path used for reporting and passed to ``parse_page``
        :return: ``(text, iocs)`` - the concatenated page texts and the list
            of everything ``parse_page`` returned
        """
        text = ""
        # Must be a list: starting from None made the first extend() raise,
        # which aborted the parsing of every document after its first page.
        iocs = []
        try:
            pdf = PdfFileReader(f, strict=False)

            if pdf.isEncrypted:
                pdf.decrypt('')

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            for page_num, page in enumerate(pdf.pages, 1):
                data = page.extractText()

                # Parse IOCs
                temp_iocs = self.parse_page(fpath, data, page_num)

                # Add IOCs to collection (nothing to add for an empty result)
                if temp_iocs:
                    iocs.extend(temp_iocs)

                # Add new page
                text += data

            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
        return text, iocs
コード例 #16
0
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)``, in sorted order.

    The result is written to ``filepath + outfile``.

    :param filepath: location handed to ``getFileName``; also the prefix of
        the output path
    :param outfile: appended to ``filepath`` to form the output path
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = sorted(getFileName(filepath))

    # The source streams have to stay open until the writer has produced
    # the output, so they are collected here and closed together at the end.
    opened = []
    try:
        for each in pdf_fileName:
            stream = open(each, "rb")
            opened.append(stream)
            source = PdfFileReader(stream, strict=False)
            if source.isEncrypted:
                source.decrypt("map")

            # number of pages of this source file
            pageCount = source.getNumPages()
            outputPages += pageCount
            print(pageCount)

            # append every page of this file to the output
            for iPage in range(pageCount):
                output.addPage(source.getPage(iPage))

        print("All Pages Number:" + str(outputPages))
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()
    print("finished")
コード例 #17
0
ファイル: scratcher.py プロジェクト: humblepoti/scratcher
 def downloadpdf(url):
     """Download ``url`` and open the response body as a PDF.

     Returns ``None`` when the response is served as ``text/html``, ``2``
     when the body cannot be read as a PDF even after stripping NUL bytes,
     and otherwise the reader (or whatever ``Scratcher.handlepdf`` returns
     when decrypting with the empty password raises).

     The process is terminated through ``sys.exit`` when the connection
     fails or the response body cannot be wrapped in a ``BytesIO``.
     """
     try:
         request = requests.get(url, verify=False)
         if request.headers.get('Content-Type') == 'text/html':
             return None
     except requests.exceptions.ConnectionError:
         sys.exit(
             "\nThere was an error when trying to connect to the domain. Please confirm if the domain is "
             "correctly written.\n")
     try:
         objbyte = BytesIO(request.content)
     except Exception as e:
         Scratcher.log(url, e)
         sys.exit(
             "\nThere was an error when trying to convert the content of the response.Please verify the logs to"
             " see the raised error.\n")
     try:
         pdf = PdfFileReader(objbyte)
     except utils.PdfReadError as e:
         Scratcher.log(url, e)
         # second attempt without leading/trailing NUL bytes
         obje = BytesIO(request.content.strip(b'\x00'))
         try:
             pdf = PdfFileReader(obje)
         except utils.PdfReadError:
             return 2
     if pdf.getIsEncrypted() is True:
         try:
             pdf.decrypt('')
         except Exception:
             # narrowed from a bare except so that KeyboardInterrupt and
             # SystemExit are no longer swallowed here
             pdf = Scratcher.handlepdf(request.content)
     return pdf
コード例 #18
0
ファイル: iocp.py プロジェクト: 453483289/ioc_parser
    def parse_pdf_pypdf2(self, f, fpath):
        """Extract the text of a PDF page by page and collect its IOCs.

        Every page's text is handed to ``self.parse_page``. Errors other
        than KeyboardInterrupt/SystemExit are reported through
        ``self.handler.print_error`` and end the parsing; whatever was
        gathered up to that point is still returned.

        :param f: passed to ``PdfFileReader``
        :param fpath: path used for reporting and passed to ``parse_page``
        :return: ``(text, iocs)`` - the concatenated page texts and the list
            of everything ``parse_page`` returned
        """
        text = ""
        # Must be a list: starting from None made the first extend() raise,
        # which aborted the parsing of every document after its first page.
        iocs = []
        try:
            pdf = PdfFileReader(f, strict=False)

            if pdf.isEncrypted:
                pdf.decrypt('')

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            for page_num, page in enumerate(pdf.pages, 1):
                data = page.extractText()

                # Parse IOCs
                temp_iocs = self.parse_page(fpath, data, page_num)

                # Add IOCs to collection (nothing to add for an empty result)
                if temp_iocs:
                    iocs.extend(temp_iocs)

                # Add new page
                text += data

            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
        return text, iocs
コード例 #19
0
ファイル: task.py プロジェクト: Lamaun/excalibur
def save_page(filepath, page_number):
    """Write one page of a PDF to ``page-<page_number>.pdf`` beside it.

    When ``get_rotation`` reports a rotation for the page's text, the file
    just written is renamed to ``p-<page_number>_rotated.pdf`` and a copy
    turned by 90 degrees is written under the original name.

    :param filepath: path of the source PDF
    :param page_number: 1-based number of the page to extract
    """
    outpath = os.path.join(os.path.dirname(filepath), 'page-{}.pdf'.format(page_number))
    # The source has to stay open until the single page has been written.
    with open(filepath, 'rb') as source:
        infile = PdfFileReader(source, strict=False)
        if infile.isEncrypted:
            infile.decrypt('')
        outfile = PdfFileWriter()
        outfile.addPage(infile.getPage(page_number - 1))
        with open(outpath, 'wb') as f:
            outfile.write(f)

    froot, fext = os.path.splitext(outpath)
    layout, __ = get_page_layout(outpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation != '':
        outpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
        os.rename(outpath, outpath_new)
        with open(outpath_new, 'rb') as rotated_source:
            infile = PdfFileReader(rotated_source, strict=False)
            if infile.isEncrypted:
                infile.decrypt('')
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            # turn the page against the detected rotation
            if rotation == 'anticlockwise':
                p.rotateClockwise(90)
            elif rotation == 'clockwise':
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(outpath, 'wb') as f:
                outfile.write(f)
コード例 #20
0
def count_pages(filename: str) -> int:
    """Return the number of pages of the PDF at ``filename``.

    An encrypted document is first decrypted with the empty password.
    """
    with open(filename, "rb") as stream:
        reader = PdfFileReader(stream)
        if reader.isEncrypted:
            reader.decrypt("")
        return reader.getNumPages()
def get_edition2(i):
    """Scan the second and third page of a PDF for year, edition and ISBN.

    Matches are stored in the module-level ``edition_dic`` under the keys
    ``Creation_Year``, ``Edition`` and ``ISBN``; a match on the later page
    replaces one from the earlier page.

    :param i: passed straight to ``PdfFileReader``
    :return: ``edition_dic``
    """
    pdf = PdfFileReader(i)

    if pdf.isEncrypted:
        pdf.decrypt('')

    page_count = pdf.getNumPages()
    for j in (1, 2):
        if j >= page_count:
            # short document: there is no such page to scan
            break

        text = pdf.getPage(j).extractText()

        year = re.search(r'© \d\d\d\d', text)
        if year:
            # drop the leading "© "
            edition_dic.update({'Creation_Year': year.group()[2:6]})

        edition = re.search(r'\d*\w\w edition', text)
        if edition:
            edition_dic.update({'Edition': edition.group()})

        match = re.search(r'ISBN [\d*-]*\d*', text)
        if match:
            # drop the leading "ISBN "
            edition_dic.update({'ISBN': match.group()[5:]})

    return edition_dic
コード例 #22
0
def split_pdf_pages(pdf_input_path: str, output_folder: str, max_pages=None):
    """
    Split pdf into individual pages and save to output_folder with the
    file name produced by ``pageFilename(pdf_input_path, page_num)``.

    if max_pages is provided and the document has more pages than that,
    only ``max_pages`` pages are written. Pages will be selected at random.

    :param pdf_input_path: path of the pdf to split
    :param output_folder: folder the single-page pdfs are written to
    :param max_pages: optional upper bound on the number of pages written
    """
    # create staging file for filename issues; the context manager makes
    # sure it is closed even when reading or writing fails
    with tempfile.TemporaryFile() as staging_file_handle:
        with open(pdf_input_path, "rb") as f:
            shutil.copyfileobj(f, staging_file_handle)
        staging_file_handle.seek(0)

        pdf = PdfFileReader(staging_file_handle, strict=False)
        if pdf.isEncrypted:
            pdf.decrypt("")

        page_numbers = range(pdf.numPages)

        # Take random page numbers if max_pages is provided
        if max_pages and pdf.numPages > max_pages:
            page_numbers = random.sample(page_numbers, max_pages)

        for page_num in page_numbers:
            out_pdf = PdfFileWriter()
            pdf_page_filepath = pageFilename(pdf_input_path, page_num)
            pdf_filename = os.path.basename(pdf_page_filepath)
            output_filepath = os.path.join(output_folder, pdf_filename)
            out_pdf.addPage(pdf.getPage(page_num))
            with open(output_filepath, "wb") as f:
                out_pdf.write(f)
コード例 #23
0
ファイル: parser.py プロジェクト: ekulos/eea.converter
    def _parse(self, pdf, password='', **kwargs):
        """ parses the given pdf file and returns a mapping of attributes

        The result of ``self._parsepdf`` is combined with the document
        information dictionary of the pdf (keys lower-cased and stripped of
        ``/``) and then passed through ``self._fix_metadata``.

        :param pdf: passed to ``self._parsepdf`` and ``PdfFileReader``
        :param password: used to decrypt the document when not empty
        :return: the fixed metadata mapping
        """
        metadata = self._parsepdf(pdf, password)
        if not metadata:
            metadata = {}

        opdf = PdfFileReader(pdf)
        if password:
            opdf.decrypt(password)

        # getDocumentInfo() gives None for a document without an info
        # dictionary; fall back to an empty mapping instead of crashing
        info = opdf.getDocumentInfo() or {}

        new_metadata = dict(
            (key.strip('/').lower(), val) for key, val in info.items())

        # #116365 use title from pdf parsing instead of the data coming
        # from pypdf2 getDocumentInfo method as that method will wrongly
        # encode an mdash found in the title
        parsed_title = metadata.get('title', {}).get('x-default')
        if parsed_title:
            new_metadata['title'] = parsed_title

        metadata.update(new_metadata)
        #
        # Fix some metadata
        #
        metadata = self._fix_metadata(metadata)
        return metadata
コード例 #24
0
    def downloadpdf(url):
        """Download ``url`` and open the response body as a PDF.

        Returns ``None`` when the response is served as ``text/html`` or
        the body cannot be read as a PDF, and otherwise the reader (or
        whatever ``Scratcher.handlepdf`` returns when decrypting with the
        empty password raises).

        The process is terminated through ``sys.exit`` when the connection
        fails.
        """
        try:
            request = requests.get(url, verify=False)
            # .get(): a response without a Content-Type header is not html
            if request.headers.get('Content-Type') == 'text/html':
                return None
        except requests.exceptions.ConnectionError:
            sys.exit(
                "\nThere was an error when trying to connect to the domain. Please confirm if the domain is correctly written.\n"
            )
        try:
            objbyte = BytesIO(request.content)
        except Exception as e:
            print(e)
            return None
        # Output is silenced while the document is opened. stdout is put
        # back in a finally block so that a failed read cannot leave it
        # redirected (the print below would otherwise go into the buffer).
        s_stdout = sys.stdout
        try:
            sys.stdout = BytesIO()
            try:
                pdf = PdfFileReader(objbyte)
            finally:
                sys.stdout = s_stdout
        except Exception as e:
            print(e)
            return None
        if pdf.getIsEncrypted() is True:
            try:
                pdf.decrypt('')
            except Exception:
                # narrowed from a bare except so that KeyboardInterrupt and
                # SystemExit are no longer swallowed here
                pdf = Scratcher.handlepdf(request.content)

        return pdf
コード例 #25
0
def save_page(filepath, page_number):
    """Write one page of a PDF to ``page-<page_number>.pdf`` beside it.

    When ``get_rotation`` reports a rotation for the page's text, the file
    just written is renamed to ``p-<page_number>_rotated.pdf`` and a copy
    turned by 90 degrees is written under the original name.

    :param filepath: path of the source PDF
    :param page_number: 1-based number of the page to extract
    """
    outpath = os.path.join(os.path.dirname(filepath),
                           f"page-{page_number}.pdf")
    # The source has to stay open until the single page has been written.
    with open(filepath, "rb") as source:
        infile = PdfFileReader(source, strict=False)
        if infile.isEncrypted:
            infile.decrypt("")
        outfile = PdfFileWriter()
        outfile.addPage(infile.getPage(page_number - 1))
        with open(outpath, "wb") as f:
            outfile.write(f)

    froot, fext = os.path.splitext(outpath)
    layout, __ = get_page_layout(outpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation != "":
        outpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
        os.rename(outpath, outpath_new)
        with open(outpath_new, "rb") as rotated_source:
            infile = PdfFileReader(rotated_source, strict=False)
            if infile.isEncrypted:
                infile.decrypt("")
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            # turn the page against the detected rotation
            if rotation == "anticlockwise":
                p.rotateClockwise(90)
            elif rotation == "clockwise":
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(outpath, "wb") as f:
                outfile.write(f)
コード例 #26
0
ファイル: pdfsplit.py プロジェクト: spanth/pdfsplit
def split_pdf(input_file, output_file, pages):
    """Copy selected pages of a PDF into a new file.

    An encrypted input is decrypted with the empty password. When PyPDF2
    raises while decrypting, ``qpdf`` is run to rewrite ``input_file`` in
    place as a decrypted file (using ``temp.pdf`` in the working directory
    as scratch copy) and the file is read again.

    :param input_file: path of the source PDF
    :param output_file: path of the PDF to create
    :param pages: iterable of strings, each a 1-based page number ("4") or
        an inclusive range ("2-5")
    """
    import shutil
    import subprocess

    fp = open(input_file, "rb")
    try:
        inputpdf = PdfFileReader(fp)

        if inputpdf.isEncrypted:
            try:
                inputpdf.decrypt('')
                print('File Decrypted (PyPDF2)')
            except Exception:
                # qpdf overwrites input_file, so release our handle first
                fp.close()
                try:
                    shutil.copyfile(input_file, "temp.pdf")
                    # argument list instead of a shell string: the file
                    # name is never interpreted by a shell
                    subprocess.call(["qpdf", "--password=", "--decrypt",
                                     "temp.pdf", input_file])
                finally:
                    if os.path.exists("temp.pdf"):
                        os.remove("temp.pdf")
                print('File Decrypted (qpdf)')
                # binary mode: PdfFileReader cannot parse a text-mode stream
                fp = open(input_file, "rb")
                inputpdf = PdfFileReader(fp)

        outputpdf = PdfFileWriter()

        # expand "a-b" ranges; single entries are kept as given
        newpages = []
        for p in pages:
            if '-' in p:
                num_range = p.split('-')
                newpages.extend(
                    range(int(num_range[0]), int(num_range[1]) + 1))
            else:
                newpages.append(p)

        print(newpages)
        pages_int = [int(x) for x in newpages]

        for i in pages_int:
            outputpdf.addPage(inputpdf.getPage(i - 1))
        with open(output_file, "wb") as outf:
            outputpdf.write(outf)
    finally:
        fp.close()
コード例 #27
0
    def _get_pages(self, filepath, pages):
        """Expand a page specification into a sorted list of page numbers.

        ``pages`` is either ``"all"`` or a comma-separated list whose items
        are single numbers (``"3"``) or inclusive ranges (``"2-5"``); a range
        may end in ``all`` (``"8-all"``) to run up to the last page.

        The page count is read from ``self.filepath``; the ``filepath``
        argument is not used.
        """
        with open(self.filepath, 'rb') as stream:
            pdf = PdfFileReader(stream)
            if pdf.isEncrypted:
                pdf.decrypt(self.password)
            last_page = pdf.getNumPages()

        # inclusive (first, last) pairs, one per item of the specification
        if pages == "all":
            spans = [(1, last_page)]
        else:
            spans = []
            for item in pages.split(','):
                if '-' not in item:
                    spans.append((int(item), int(item)))
                    continue
                first, last = item.split('-')
                if last == 'all':
                    last = last_page
                spans.append((int(first), int(last)))

        selected = set()
        for first, last in spans:
            selected.update(range(first, last + 1))

        return sorted(selected)
コード例 #28
0
ファイル: pdflib.py プロジェクト: shi-cong/PYSTUDY
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    :param pdf_file_in: path of the PDF to watermark
    :param pdf_file_mark: path of the PDF whose first page is the watermark
    :param pdf_file_out: path the result is written to
    :return: ``False`` when the input is encrypted and cannot be opened
        with the empty password, ``None`` otherwise
    """
    pdf_output = PdfFileWriter()
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # the PDF is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # try the empty password; decrypt() reports a wrong password by
            # returning 0, so the result is checked as well as exceptions
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # number of pages of the input
        page_num = pdf_input.getNumPages()
        # read the watermark pdf
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)
            # watermark every page
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)
            # both inputs are still open while the output is produced
            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
コード例 #29
0
def meta_pdf(url_pdf):
    """Download a PDF and record its document information.

    The file is stored under ``./<result.url_s>/`` using the last path
    segment of the URL as name. Its document-info entries are printed and
    appended to ``./<result.url_s>/metadata_results.txt``.

    Failures of either step are reported with a printed message instead of
    an exception.

    :param url_pdf: URL of the PDF to fetch
    """
    try:
        path = "./%s/%s" % (result.url_s, url_pdf.split("/")[-1])
        headers = {'user-agent': result.user_agentt}
        print(tc.bold_yellow("Download: %s" % url_pdf))
        r = requests.get(url_pdf, headers=headers)
        with open(path, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("%s Can't Download or search for metadata" % url_pdf)
    try:
        print(tc.bold_yellow("File %s metadata:" % path))
        # both files are closed again even when reading the metadata fails
        with open("./%s/metadata_results.txt" % result.url_s, "a+") as fl:
            fl.write("---Url: %s --- \n" % url_pdf)
            fl.write("--------> File %s metadata: <------------ \n" % path)
            with open(path, 'rb') as fp:
                pdf = PdfFileReader(fp)
                if pdf.isEncrypted:
                    pdf.decrypt('')
                info = pdf.getDocumentInfo()
                for i in info:
                    print(tc.italic_yellow(i + ": " + info[i]))
                    fl_write_line = "    " + i + ": " + info[i] + "\n"
                    fl.write(fl_write_line.encode('utf8'))
    except Exception:
        print(tc.bold_red("Can't read metadata in: %s" % path))
コード例 #30
0
def decrypt(query, pdfs):
    """Decrypt PDF files.

    Every encrypted file is written again, without encryption, next to the
    original as ``<name> (decrypted).pdf``. The outcome is reported per
    file through ``notify.notify``. A ``PdfReadError`` is reported as an
    invalid password and ends the processing of the remaining files.

    :param query: password tried on every encrypted file
    :param pdfs: iterable of PDF paths
    """
    try:
        for pdf in pdfs:
            reader = PdfFileReader(pdf, strict=False)

            if not reader.isEncrypted:
                notify.notify('Alfred PDF Tools',
                              'The PDF file is not encrypted.')
                continue

            reader.decrypt(query)
            writer = PdfFileWriter()

            # range() instead of xrange(): xrange is gone in Python 3
            for i in range(reader.numPages):
                writer.addPage(reader.getPage(i))

            noextpath = os.path.splitext(pdf)[0]
            out_file = "{} (decrypted).pdf".format(noextpath)

            with open(out_file, 'wb') as f:
                writer.write(f)

            notify.notify('Alfred PDF Tools',
                          'Decryption successfully completed.')

    except PdfReadError:
        notify.notify('Alfred PDF Tools',
                      'The entered password is not valid.')
コード例 #31
0
ファイル: pdf_helper.py プロジェクト: culiutudousi/PdfHelper
class PdfHelper:
    """Extract page ranges or arbitrary page selections from one PDF file.

    Page numbers accepted by :meth:`split_pages` and :meth:`select_pages`
    are 1-based; they are converted to 0-based indexes when pages are read.
    """

    def __init__(self, file_name):
        """Open ``file_name`` and try to unlock it with an empty password."""
        self.file_name = file_name
        self.reader = PdfFileReader(self.file_name, strict=False)
        self.num_pages = self.reader.getNumPages()
        if self.reader.isEncrypted:
            print('Trying to decrypt ...')
            try:
                # decrypt() returns 0 when the password does not match.
                if self.reader.decrypt('') == 0:
                    print('Failed to decrypt')
                else:
                    print('Success!')
            except Exception:
                print('Failed to decrypt')

    def split_pages(self, start_page, end_page, output_name):
        """Write pages ``start_page``..``end_page`` (inclusive) to ``output_name``.

        Prints an error instead of writing when the range is empty or falls
        outside ``1..num_pages``.
        """
        if 1 <= start_page <= end_page <= self.num_pages:
            self.select_pages(range(start_page, end_page + 1), output_name)
        else:
            print(
                'ERROR: page number out of range: start {}, end {}, total {}.'.
                format(start_page, end_page, self.num_pages))

    def select_pages(self, pages, output_name):
        """Write the given 1-based ``pages`` (in order) to ``output_name``.

        Prints an error instead of writing when ``pages`` is empty or any
        page number falls outside ``1..num_pages``.
        """
        # Materialise once so one-shot iterables survive the range checks.
        pages = list(pages)
        if not pages:
            print('ERROR: no pages selected.')
            return
        if max(pages) > self.num_pages:
            print('ERROR: page number out of range: max {}, total {}.'.format(
                max(pages), self.num_pages))
            return
        if min(pages) < 1:
            # Without this check page 0 would wrap around to the last page.
            print('ERROR: page number out of range: min {}, total {}.'.format(
                min(pages), self.num_pages))
            return

        writer = PdfFileWriter()
        for p in pages:
            writer.addPage(self.reader.getPage(p - 1))
        with open(output_name, 'wb') as outfile:
            writer.write(outfile)
コード例 #32
0
    def merge_pdf(self, data, base_date):
        """Merge each company's individual report PDFs into one combined PDF.

        ``data`` is indexed two-dimensionally (``data[i, j]``); per row the
        columns read here are: 0 company name, 1 business number,
        2 department, 3 director name, 4-6 status flags.  A row is merged
        only when all three flags equal ``'O'``; otherwise a warning is
        logged and the row is skipped.

        For every name in ``self.file_list`` the matching PDF is opened; if
        it is encrypted and cannot be unlocked with an empty password, it is
        decrypted to a temporary file with the external ``qpdf`` tool.  Any
        failure while merging a row is logged and the next row is processed.
        """
        import subprocess  # only needed for the qpdf fallback

        data_length = len(data[:, 0])

        for i in range(data_length):

            company_name = data[i, 0]
            bizno = data[i, 1]
            department = data[i, 2]
            director_name = data[i, 3]

            accounts_balance = data[i, 4]
            accounts_maturity = data[i, 5]
            credit_exposure = data[i, 6]

            action = f'\n{department}/{director_name}/{bizno}-{company_name}의 {base_date} pdf 병합'

            if accounts_balance != 'O' or accounts_maturity != 'O' or credit_exposure != 'O':
                log_str = action + "실패\n"
                self.logger.warning(log_str)
                continue

            file_location = os.path.join(os.getcwd() + "/pdf_file", department,
                                         base_date, director_name,
                                         company_name).replace(' ', '_')

            pdf_merger = PdfFileMerger()
            try:
                for file_name in self.file_list:
                    file_path = os.path.join(
                        file_location,
                        f'{company_name}_{base_date}_{file_name}.pdf').replace(
                            ' ', '_')
                    tmp_file_path = os.path.join(
                        file_location,
                        f'{company_name}_{base_date}_{file_name}_tmp.pdf'
                    ).replace(' ', '_')

                    pdf_file_object = open(file_path, 'rb')
                    try:
                        pdf_file = PdfFileReader(pdf_file_object)
                        if pdf_file.isEncrypted:
                            try:
                                pdf_file.decrypt('')
                            except Exception:
                                # Fall back to qpdf.  An argument list (no
                                # shell) keeps odd characters in file names
                                # from being interpreted as shell syntax.
                                subprocess.run(['qpdf', '--decrypt',
                                                file_path, tmp_file_path])
                                pdf_file_object.close()
                                pdf_file_object = open(tmp_file_path, 'rb')
                                pdf_file = PdfFileReader(pdf_file_object)

                        pdf_merger.append(pdf_file)
                    finally:
                        pdf_file_object.close()
                        # The temp file only exists when the qpdf fallback
                        # ran; removing it unconditionally used to raise and
                        # abort the merge for every ordinary file.
                        if os.path.exists(tmp_file_path):
                            os.remove(tmp_file_path)

                output_file_path = os.path.join(
                    file_location, f'{company_name}_{base_date}_종합.pdf')
                pdf_merger.write(output_file_path)
            except Exception:
                self.logger.warning(f"\n{action} 병합실패\n")
                continue
コード例 #33
0
    def decrypt_file(self, password):
        """Unlock the PDF at ``self.read_dir`` and save it to ``self.write_dir``.

        The source is opened by its string path, decrypted with ``password``
        and all of its pages are copied into a fresh writer, which is then
        written to ``self.write_dir`` in binary mode.
        """
        source = PdfFileReader(str(self.read_dir))
        source.decrypt(password)

        target = PdfFileWriter()
        target.appendPagesFromReader(source)

        with self.write_dir.open(mode='wb') as sink:
            target.write(sink)
コード例 #34
0
def read_data(file_object):
    """Return a ``PdfFileReader`` for ``file_object``.

    If the document reports itself as encrypted, an unlock with the empty
    password is attempted before the reader is returned.
    """
    reader = PdfFileReader(file_object)
    needs_unlock = reader.isEncrypted
    if needs_unlock:
        reader.decrypt('')
    return reader
コード例 #35
0
    def pdf(self):
        """Return a ``PdfFileReader`` for ``self.data``, or ``None`` if unreadable.

        If the raw data cannot be parsed, one repair attempt is made through
        ``self._fixPdf`` before giving up.  Encrypted documents are unlocked
        with the empty password; failures are only logged.  When no reader
        could be created, the image previews of ``self.context`` are removed.
        """
        pdf = None
        try:
            pdf = PdfFileReader(StringIO(self.data))
        except Exception:
            logger.warn("Error opening pdf file, trying to fix it...")
            fixed_data = self._fixPdf(self.data)

            # try to reopen the pdf file again
            try:
                pdf = PdfFileReader(StringIO(fixed_data))
            except Exception:
                logger.warn("This pdf file cannot be fixed.")

        if pdf and pdf.isEncrypted:
            try:
                # decrypt() returns 0 when the password does not match.
                decrypt = pdf.decrypt('')
                if decrypt == 0:
                    logger.warn("This pdf is password protected.")
            except Exception:
                logger.warn("Errors while decrypting the pdf file.")

        if pdf is None:
            remove_image_previews(self.context)

        return pdf
コード例 #36
0
ファイル: parsers.py プロジェクト: xstevens/recon-ng
def pdf_parser(s):
    """Return the document-info metadata of the PDF contained in string ``s``.

    Keys are the PDF info names with their leading ``/`` stripped.  An empty
    dict is returned when the document uses an unsupported encryption method
    or carries no document information.
    """
    s = s.strip()
    # required to suppress warning messages; the sink must stay open for
    # every PyPDF2 call, because the reader keeps writing warnings to it
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
    # getDocumentInfo() returns None when the file has no info dictionary.
    if not meta:
        return {}
    return {key[1:]: meta.get(key) for key in meta.keys()}
コード例 #37
0
ファイル: gen.py プロジェクト: zhangxj/esbi
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open ``pdf_file_in`` and make sure it is readable (decrypted).

    Returns ``False`` when the input is encrypted and decrypting it with an
    empty password raises.

    NOTE(review): the visible code never uses ``pdf_output``,
    ``pdf_file_mark`` or ``pdf_file_out`` and never closes ``input_stream``;
    this looks like a truncated excerpt of a longer function -- confirm
    against the full source.
    """
    pdf_output = PdfFileWriter()
    input_stream = file(pdf_file_in, 'rb')
    pdf_input = PdfFileReader(input_stream)

    # The PDF file is encrypted
    if pdf_input.getIsEncrypted():
        print '该PDF文件被加密了.'
        # Try to decrypt it with an empty password
        try:
            pdf_input.decrypt('')
        except Exception, e:
            print '尝试用空密码解密失败.'
            return False
        else:
            print '用空密码解密成功.'
コード例 #38
0
ファイル: pdfmerge.py プロジェクト: metaist/pdfmerge
def add(path, password='', writer=None, rules=RULE_DEFAULT):
    """Append the pages selected from one or more PDFs to a writer.

    ``path`` may be a list of paths, a directory (all ``*.pdf`` inside it),
    a glob pattern, or a single file.  A path matching ``RE_HAS_RULE``
    carries its own rules, which replace ``rules`` for that path.

    Args:
        path (str, list):       path or list of paths to merge
        password (str):         password for encrypted files
        writer (PdfFileWriter): writer to extend; created when omitted
        rules (str):            comma-separated page and rotation rules

    Returns:
        (PdfFileWriter). The writer holding every selected page.
    """
    if writer is None:
        writer = PdfFileWriter()

    # A list is handled by feeding each entry back through this function.
    if isinstance(path, list):
        for subpath in path:
            writer = add(subpath, password, writer, rules)
        return writer

    embedded = RE_HAS_RULE.search(path)
    if embedded:
        path, rules = embedded.groups()
    rules = re.sub(r'\s', '', rules)  # rules never contain whitespace

    # A directory stands for every pdf directly inside it.
    if os.path.isdir(path):
        path = os.path.join(path, '*.pdf')

    # Wildcards expand to a list, which takes the branch above.
    if '*' in path:
        return add(glob(path), password, writer, rules)

    # Base case: exactly one file.
    assert os.path.isfile(path), ERROR_PATH.format(path)
    source = PdfFileReader(open(path, 'rb'))
    if source.isEncrypted:
        source.decrypt(password)

    for rule in rules.split(','):
        parsed = RE_RULE.search(rule)
        assert parsed, ERROR_RULE.format(rule)
        _, _, _, rotation_key = parsed.groups()
        for page_number in rangify(parsed, source.getNumPages()):
            # Rule page numbers are 1-based, getPage() is 0-based.
            rotated = source.getPage(page_number - 1).rotateClockwise(
                RULE_ROTATE[rotation_key]
            )
            writer.addPage(rotated)
    return writer
コード例 #39
0
ファイル: tools.py プロジェクト: blawesom/pdf-merger
def merge_files(local_pdfs):
    """Merge the uploaded PDFs named in ``local_pdfs`` into one output file.

    Each source is resolved inside the configured upload folder, appended to
    the merged document (after an empty-password unlock when encrypted) and
    then deleted.  The merged file is written to the same folder.

    Returns the full path of the merged PDF.
    """
    # NOTE(review): time.clock() was removed in Python 3.8; this module
    # targets Python 2.
    name = 'merge_{0}_output.pdf'.format(str(time.clock())[2:])
    merged_export = PdfFileMerger()
    for pdfile in local_pdfs:
        filepath = getpath(pdfile, config().get(section='server', option='upload_folder'))
        # Close the handle before deleting the file instead of leaking it.
        # Assumes PdfFileMerger.append() copies the reader's stream contents
        # into memory, as PyPDF2 1.x does -- confirm for the pinned version.
        with open(filepath, 'rb') as source:
            file_bin = PdfFileReader(source)
            if file_bin.getIsEncrypted():
                file_bin.decrypt('')

            merged_export.append(fileobj=file_bin)
        os.remove(filepath)
    full_ouput = getpath(name, config().get(section='server', option='upload_folder'))
    with open(full_ouput, 'wb') as output:
        merged_export.write(output)

    return full_ouput
コード例 #40
0
ファイル: yapot.py プロジェクト: thequbit/yapot
def split_pdf(pdf_filename, temp_dir):
    '''
    Split the PDF into n PDFs ( one for each page ).

    Each page is written to ``<temp_dir>/<basename>-p<index>.pdf`` where
    ``index`` is the 0-based page number.  Encrypted inputs are unlocked
    with the empty password first.

    Returns the list of written file names, in page order.
    '''
    filenames = []
    base_name = os.path.basename(pdf_filename)
    # Keep the input open only while pages are being read from it.
    with open(pdf_filename, "rb") as input_stream:
        inputpdf = PdfFileReader(input_stream)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(temp_dir, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
コード例 #41
0
def extract_creation_date(filename):
    """Return the creation time recorded in a PDF's document information.

    The ``/CreationDate`` entry is parsed from its ``D:YYYYMMDDHHmmSS``
    prefix; any timezone suffix is ignored, so the result is a naive
    ``datetime``.

    Raises:
        KeyError: the document info has no ``/CreationDate`` entry.
        TypeError: the document has no information dictionary at all.
        AttributeError: the entry does not start with ``D:`` and 14 digits.
    """
    with open(filename, "rb") as stream:
        # strict=False avoids 'PdfReadWarning: Xref table not zero-indexed.
        # ID numbers for objects will be corrected. [pdf.py:1736]'
        pdf_toread = PdfFileReader(stream, strict=False)
        # "file has not been decrypted" error
        # https://github.com/mstamy2/PyPDF2/issues/51
        if pdf_toread.isEncrypted:
            pdf_toread.decrypt('')
        pdf_info = pdf_toread.getDocumentInfo()
        # Read the value while the stream is still open.
        raw_date = pdf_info['/CreationDate']
    # PDF Reference, 3.8.3 Dates,
    # http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
    # A date is an ASCII string of the form (D:YYYYMMDDHHmmSSOHH'mm')
    # Example: D:20170508085336+02'00'
    date_str = re.search(r'^D:(\d{14})', raw_date).group(1)
    timestamp = datetime.strptime(date_str, "%Y%m%d%H%M%S")
    return timestamp
コード例 #42
0
ファイル: yapot_utils.py プロジェクト: thequbit/yapot
def split_pdf(pdf_filename):
    """Split a PDF into one single-page PDF per page, next to the source.

    Each page is written to ``<dir>/<basename>-p<index>.pdf`` where ``dir``
    is the directory of ``pdf_filename`` (``.`` when it has none) and
    ``index`` is the 0-based page number.  Encrypted inputs are unlocked
    with the empty password first.

    Returns the list of written file names, in page order.
    """
    filenames = []
    # Both parts of the output name are the same for every page.
    directory = os.path.dirname(pdf_filename)
    if directory == '':
        directory = '.'
    base_name = os.path.basename(pdf_filename)

    # Keep the input open only while pages are being read from it.
    with open(pdf_filename, "rb") as input_stream:
        inputpdf = PdfFileReader(input_stream)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(directory, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
コード例 #43
0
ファイル: iohelper.py プロジェクト: cykerway/stapler
def read_pdf(filename):
    """Return a PyPDF2 reader for ``filename``.

    Raises ``CommandError`` when the file does not exist.  For an encrypted
    file the user is prompted for a password until one is accepted.
    """
    if not os.path.exists(filename):
        raise CommandError("{} does not exist".format(filename))

    pdf = PdfFileReader(open(filename, "rb"))
    if not pdf.isEncrypted:
        return pdf

    # Keep asking until decrypt() reports a match (non-zero result).
    while True:
        candidate = prompt_for_pw(filename)
        if pdf.decrypt(candidate):
            return pdf
        print("The password did not match.")
コード例 #44
0
ファイル: metadatos.py プロジェクト: hartek/metadatos
def print_pdf(file_full_path, color_mode):
	"""Analyzes the metadata of a .pdf file

	Prints a header, then one line per document-info entry (key without its
	leading "/").  With ``color_mode`` set, output goes through ``cprint``
	with colors; otherwise plain ``print`` is used.  Unreadable files,
	files that cannot be decrypted with an empty password and unreadable
	entries are reported instead of raising.
	"""
	# Header with file path
	if color_mode: cprint("\n[+] Metadata for file: %s" % (file_full_path), "green", attrs=["bold"])
	else: print("\n[+] Metadata for file: %s" % (file_full_path))
	stream = None
	try:
		# Open the file
		try:
			stream = open(file_full_path, "rb")
			pdf_file = PdfFileReader(stream)
		except Exception:
			if color_mode: cprint("Could not read this file. Sorry!", "red")
			else: print("Could not read this file. Sorry!")
			return
		if pdf_file.isEncrypted: # Temporary workaround, pdf encrypted with no pass
			try:
				pdf_file.decrypt('')
			except Exception:
				if color_mode: cprint("\tCould not decrypt this file. Sorry!", "red")
				else: print("\tCould not decrypt this file. Sorry!")
				return
		# Data structure with document information
		pdf_info = pdf_file.getDocumentInfo()
		# Print metadata
		if pdf_info:
			for metaItem in pdf_info:
				try:
					if color_mode:
						cprint("\t-" + metaItem[1:] + ": ", "cyan", end="")
						cprint(pdf_info[metaItem])
					else:
						# Concatenation raises TypeError for non-string values.
						print("\t-" + metaItem[1:] + ": " + pdf_info[metaItem])
				except TypeError:
					if color_mode: cprint("\t-" + metaItem[1:] + ": " + "Error - Item not readable", "red")
					else: print("\t-" + metaItem[1:] + ": " + "Error - Item not readable")
		else:
			if color_mode: cprint("\t No data found", "red")
			else: print("\t No data found")
		print("")
	finally:
		# Release the file handle on every exit path.
		if stream is not None:
			stream.close()
コード例 #45
0
ファイル: serve.py プロジェクト: philipdexter/pdfserve
def pdf(which, page=None):
    """Serve a single page of a stored pdf, extracting it on first request.

    Without ``page`` the last saved page for ``which`` is used; ``'all'``
    sends the complete file.  A negative page redirects to page 0.  The
    requested page is written to its own file under the ``pages`` folder
    if it is not there yet, remembered as the saved page, and the index
    template is rendered.
    """
    if page is None:
        page = get_saved_page(which)
    if page == 'all':
        return send_file(build_path(which))

    page = int(page)
    source_path = build_path(which)
    pages_dir = build_path(which, 'pages')
    single_page_path = build_path(which, 'pages', page)
    if page < 0:
        return redirect('{}/{}'.format(which, 0))

    already_extracted = isfile(single_page_path)
    if not already_extracted:
        makedirs(pages_dir, exist_ok=True)
        single_page = PdfFileWriter()
        with open(source_path, 'rb') as source:
            document = PdfFileReader(source)
            if document.isEncrypted:
                document.decrypt('')
            single_page.addPage(document.getPage(page))
            # Written while the source is still open.
            with open(single_page_path, 'wb') as target:
                single_page.write(target)

    set_saved_page(which, page)
    return render_template('index.html', which=which, page=page)
コード例 #46
0
    def __save_task_pdf(self, task):
        """Download the task's document and save only the task's pages.

        The document is downloaded into the temporary directory.  When the
        URL contains a ``#`` the download is treated as a zip archive and
        the pdf is obtained through ``__pdf_from_zip``.  The pages listed in
        ``task.pages`` (1-based) are copied to ``self.task_pdf_path(task)``.
        The temporary pdf is removed afterwards, whether or not an error
        occurred.
        """
        # Defined before the try so the cleanup below never hits an unbound
        # name when an early step fails.
        tmp_pdf_path = None
        try:
            extension = '.pdf'
            sep = task.text_pdf_url.find('#')
            if sep != -1:
                extension = '.zip'

            tmp_pdf_path = os.path.join(self.__tmp_dir, task.key() + extension)
            task.download_text_pdf(tmp_pdf_path)

            if sep != -1:
                tmp_pdf_path = self.__pdf_from_zip(task, tmp_pdf_path, sep)

            with open(tmp_pdf_path, 'rb') as input_stream:
                input_pdf = PdfFileReader(input_stream)
                if input_pdf.isEncrypted:
                    input_pdf.decrypt('')
                output_pdf = PdfFileWriter()
                for page in task.pages:
                    output_pdf.addPage(input_pdf.getPage(page-1))
                with open(self.task_pdf_path(task), 'wb') as output_stream:
                    output_pdf.write(output_stream)
        finally:
            # Only delete what actually exists, so a failed download is not
            # masked by a secondary error raised from the cleanup.
            if tmp_pdf_path is not None and os.path.exists(tmp_pdf_path):
                os.remove(tmp_pdf_path)
コード例 #47
0
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """
    Get_info(file_path)
        Opens the pdf file for reading, prints its metadata and logs it.
    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: counter incremented locally after a success.
        - total_files: not used by this function.

    Any error while reading or logging is reported through __Print_error.
    """

    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)

    encrypted = 'No'

    try:  # Try to open not password encrypted pdf files and pdf files
          # encrypted with a blank password.
        with open(file_path, 'rb') as stream:
            pdf_file = PdfFileReader(stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                if dec_res == 1:
                    encrypted = 'Yes'

            #Get and parse metadata
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)

            num_pages = pdf_file.getNumPages()

            #Group info
            pdf_meta = pdf_metadata(file_name, title, author, creator,
                                    subject, producer, c_date, m_date,
                                    encrypted, num_pages, file_size)

            __Print_metadata(pdf_meta)

            if plain_log:
                Log(file_name, pdf_meta, plain_log, 'txt')
            if csv_log:
                # Log to the csv file that was passed in.
                Log(file_name, pdf_meta, csv_log, 'csv')

        # NOTE(review): this only rebinds the local name; the caller's
        # counter is not updated because the value is never returned.
        analyzed_files = analyzed_files + 1

    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
コード例 #48
0
    def read(self, payload, **kwargs):
        """
        Extract text from a PDF file

        :param bytes payload : Contents of pdf file
        :param **kwargs kwargs: Additional attributes; ``pdf_password`` is
                                used to decrypt an encrypted PDF

        :returns: Extracted text of all pages, concatenated
        :rtype: str

        """

        # Ensure the payload is a BytesIO object
        payload_object = BytesIO(payload)

        # Parse the PDF payload
        pdf_object = PdfFileReader(payload_object, strict=False)

        results = []

        # Determine if the pdf is encrypted, if so, let's attempt to decrypt
        if pdf_object.isEncrypted:
            try:
                # Returns 0 if the password failed, 1 if the password matched
                # the user password, and 2 if the password matched the owner
                # password.
                decrypt_return = pdf_object.decrypt(kwargs['pdf_password'])
                if decrypt_return == 0:
                    self.stoq.log.warn("Incorrect PDF encryption password")
            except NotImplementedError:
                self.stoq.log.warn("Unsupported encryption method")
            except Exception:
                # Also reached when no ``pdf_password`` was supplied.
                self.stoq.log.error("Unable to decrypt PDF. Was a password provided?")

        # Iterate over the pages and append their text to our results
        for page in pdf_object.pages:
            results.append(page.extractText())

        return "".join(results)
コード例 #49
0
def pdfobj(doc):
    """Return a ``PdfFileReader`` for ``doc``'s data, or ``None`` if unreadable.

    If the raw data cannot be parsed, one repair attempt is made through
    ``_fixPdf`` before giving up.  Encrypted documents are unlocked with the
    empty password; failures are only logged.
    """
    pdf = None
    try:
        pdf = PdfFileReader(StringIO(data(doc)))
    except Exception:
        logger.warn('Error opening pdf file, trying to fix it...')
        fixed_data = _fixPdf(data(doc))

        # try to reopen the pdf file again
        try:
            pdf = PdfFileReader(StringIO(fixed_data))
        except Exception:
            logger.warn('This pdf file cannot be fixed.')

    if pdf and pdf.isEncrypted:
        try:
            # decrypt() returns 0 when the password does not match.
            decrypt = pdf.decrypt('')
            if decrypt == 0:
                logger.warn('This pdf is password protected.')
        except Exception:
            logger.warn('Errors while decrypting the pdf file.')

    return pdf
コード例 #50
0
ファイル: IMBbinder.py プロジェクト: zom-1/IMBbinder
#!/usr/bin/env python
# coding: utf-8
''' IMBdownloader : download all Internet Magazine Back Number Archives'''
import glob
from PyPDF2 import PdfFileWriter, PdfFileReader

# Bind the PDFs found in every sub-directory of the current directory into
# one PDF per directory, dropping the last page of each source file.
magName = 'InternetMag'
dirs = glob.glob('*/')  # directory list, it’s OK??
for dirName in dirs:
    pdfName = magName+dirName[:-1]+'.pdf'  # ex) InternetMag194410.pdf
    outPdf = PdfFileWriter()  # make empty pdf
    # glob order is OS dependent; sort so the page order is reproducible.
    files = sorted(glob.glob(dirName+'*.pdf'))
    # Pages are read lazily, so every source must stay open until the
    # output has been written; they are all closed afterwards.
    openedFiles = []
    try:
        for fileName in files:
            inFile = open(fileName, "rb")
            openedFiles.append(inFile)
            inPdf = PdfFileReader(inFile)
            if inPdf.isEncrypted:  # some pdf were encrypted
                inPdf.decrypt("")  # ? why empty password ?
            pageNum = inPdf.getNumPages()-1  # delete last page, !!
            for p in range(0, pageNum):  # !! getPage(0) gets page1
                page = inPdf.getPage(p)
                outPdf.addPage(page)
        with open(pdfName, "wb") as outFile:
            outPdf.write(outFile)
    finally:
        for inFile in openedFiles:
            inFile.close()
    print(pdfName)
コード例 #51
0
### Settings ###
# Directory that holds the practice PDF files.
path = 'practice_files'


############
### MAIN ###
############

# chapter 11.1 review ex 1 : Write a script that opens the file named Walrus.pdf
#   from the Chapter 11 practice files; you will need to decrypt the file using 
#   the password "IamtheWalrus"

input_file_name = os.path.join(path, 'Walrus.pdf')
# NOTE(review): the file handle opened here is never closed in the visible code.
input_file = PdfFileReader(open(input_file_name, 'rb'))
input_file.decrypt('IamtheWalrus') # decrypt password protected file

# Writer for the result; nothing is added to it in the visible part of the script.
output_PDF = PdfFileWriter()

# chapter 11.1 review ex 2 : Rotate every page in this input file counter-clockwise by 90 degrees

for current_page in range(0, input_file.getNumPages()):
    page = input_file.getPage(current_page)
    page.rotateClockwise(-90) # rotate left 90°
    
    # chapter 11.1 review ex 3 : Split each page in half vertically, such that 
    # every column appears on its own separate page, and 
    # NOTE(review): this fetches the same page index as `page` above;
    # presumably the same page object is returned -- confirm.
    page_left = input_file.getPage(current_page)
    # Shallow copy that is meant to become the right half.
    page_right = copy.copy(page_left)
    
    # Upper-right corner of the page's media box; not used in the visible
    # part of the script (the excerpt appears to stop here).
    upper_right = page_left.mediaBox.upperRight
コード例 #52
0
ファイル: portable_documents.py プロジェクト: alvra/fity
    def extract(cls, file):
        """Build an instance of ``cls`` from the metadata of a pdf ``file``.

        A ``PdfReadError`` on opening is re-raised as ``ExtractionError``.
        Documents that hit a known PyPDF2 ``TypeError``, that use an
        unsupported encryption method, or that cannot be unlocked with an
        empty password yield an instance with empty properties.  Otherwise
        the page count and the recognised document-info entries are
        collected, skipping missing or ``None`` values.
        """
        try:
            reader = PdfFileReader(file)
        except PdfReadError as e:
            raise six.raise_from(ExtractionError("Could not open pdf reader"), e)
        except TypeError as e:
            if str(e) != "'NumberObject' object has no attribute '__getitem__'":
                raise
            # there's a bug in PyPDF2 for some valid pdf files
            return cls(file, {})

        if reader.isEncrypted:
            try:
                # an empty password unlocks many "encrypted" documents
                outcome = reader.decrypt('')
            except NotImplementedError:
                # unsupported encryption method: it's (probably) a real pdf
                # document, we just can't read its metadata
                return cls(file, {})
            if outcome == 0:
                # the password failed: again (probably) a real pdf document
                # whose metadata stays out of reach
                return cls(file, {})
            # outcomes 1 and 2 mean the document can now be read

        props = OrderedDict()

        try:
            props['pages'] = reader.numPages
        except PdfReadError:
            pass

        try:
            info = reader.documentInfo
        except PdfReadError:
            info = None

        if info is None:
            return cls(file, props)

        # (document-info key, property name, optional value parser)
        fields = (
            ('Title', 'title', None),
            ('Subject', 'subject', None),
            ('Author', 'author', None),
            ('Creator', 'creator', None),
            ('Producer', 'producer', None),
            ('CreationDate', 'created', parse_date),
            ('ModDate', 'modified', parse_date),
        )
        for key, prop, parser in fields:
            try:
                value = info['/' + key]
            except KeyError:
                continue
            if value is None:
                continue
            if parser:
                value = parser(value)
                if value is None:
                    continue
            props[prop] = value

        return cls(file, props)
コード例 #53
0
ファイル: base_meeting.py プロジェクト: milieuinfo/minaraad
    def generate_pdf(self):
        """ Checks all items and linked files to generate a huge PDF
        with all files concatenated.
        This action might be quite expensive, so it should not be called
        too often.

        The only reasons when it is called should be:
        - when a FileAttachment is modified (see events.concatenate_pdf)
        - when a FileAttachment is deleted (see
          base_agendaitem.manage_delObjects)
        - when an agenda item is deleted (see base_meeting.manage_delObjects)

        The result is stored in ``self.pdf`` (``None`` when there is no pdf
        attachment).  An attachment that cannot be decrypted is replaced by
        a generated page that links to it.  A blank page is inserted after
        every attachment except the last one when the page count is odd.
        """
        files = []
        for item in self.find_items():
            item = item.getObject()
            for att_id in item.contentIds():
                if item.is_attachment_pdf(att_id):
                    files.append(
                        {'file': StringIO(item[att_id].getFile()),
                         'attachment': '%s/%s' % (item.absolute_url(),
                                                  att_id)})

        if not files:
            self.pdf = None
            return

        self.pdf = PdfFileWriter()

        # Settings when a custom page has to be written.
        font = "Helvetica"
        font_size = 12

        for f in files:
            pdf = PdfFileReader(f['file'])
            if pdf.isEncrypted:
                # decrypt() signals failure in two ways: it raises when it
                # can not decrypt at all, or it returns 0 when the password
                # did not match.  Both mean the file can not be included.
                try:
                    decrypted = pdf.decrypt('') != 0
                except Exception:
                    decrypted = False

                if not decrypted:
                    logger.info('Could not decrypt pdf file at "%s"' %
                                f['attachment'])

                    # We generate a simple page to tell the user
                    # we were not able to include this file.
                    text = f['attachment']
                    page = StringIO()
                    my_canvas = canvas.Canvas(page)
                    my_canvas.linkURL(f['attachment'], 0)
                    my_canvas.setFont(font, font_size)
                    my_canvas.drawCentredString(
                        4.0 * inch,
                        8.5 * inch,
                        'Could not integrate file at:')
                    my_canvas.drawCentredString(
                        4.0 * inch,
                        8.0 * inch,
                        text)
                    my_canvas.save()

                    pdf = PdfFileReader(page)

            for page_num in range(pdf.numPages):
                self.pdf.addPage(pdf.getPage(page_num))

            if (self.pdf.getNumPages() % 2) == 1 and f is not files[-1]:
                self.pdf.addBlankPage()
コード例 #54
0
ファイル: autolink-legacy.py プロジェクト: riceissa/autolink
def get_link_text(url, mime_type, data=None, clean=False):
    '''
    Take URL, MIME type, and optional data to produce the link text.

    Images and unknown types get a generic "<Kind> on <tld>" text.  For
    PDFs the document title is used when present.  For HTML the first of
    og:title, twitter:title, meta name=title, itemprop=name or the <title>
    tag is used; with ``clean`` set, meta and <title> texts go through
    ``messy_title_parse``.  The result is truncated to 255 characters.
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif  "application/pdf" in mime_type:
        logging.debug("PDF detected")
        # PdfFileReader needs a seekable stream, so convert from bytes
        data = io.BytesIO(data)
        try:
            # PyPDF2 routinely emits "PdfReadWarning: Xref table not
            # zero-indexed".  Silence warnings only while the PDF is read
            # instead of disabling them for the whole process.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pdf = PdfFileReader(data, strict=True)
                # PyPDF2 somehow thinks many PDFs are encrypted with the
                # empty string, so deal with that
                if pdf.isEncrypted:
                    pdf.decrypt('')
                info = pdf.getDocumentInfo()
            # getDocumentInfo() returns None when there is no info dictionary
            result = info.title if info is not None else None
            if not result or result.strip() == "":
                result = "PDF on " + tld
        except PyPDF2.utils.PdfReadError:
            result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            og_title_lst = []
            twitter_title_lst = []
            meta_title_lst = []
            schema_lst = []
            for i in meta:
                if i.get("property") == "og:title":
                    og_title_lst.append(i.get("content"))
                elif i.get("property") == "twitter:title":
                    twitter_title_lst.append(i.get("content"))
                elif i.get("name") == "title":
                    meta_title_lst.append(i.get("content"))
                elif i.get("itemprop") == "name":
                    schema_lst.append(i.get("content"))
            if og_title_lst:
                logging.debug("found og:title")
                result = og_title_lst[0].strip()
            elif twitter_title_lst:
                logging.debug("found twitter title")
                result = twitter_title_lst[0].strip()
            elif meta_title_lst:
                logging.debug("found meta name title")
                result = meta_title_lst[0].strip()
                if clean:
                    result = messy_title_parse(result, url)
            elif schema_lst:
                logging.debug("found schema title")
                result = schema_lst[0].strip()
            elif soup.title and soup.title.string:
                logging.debug("found title tag")
                result = html.unescape(soup.title.string)
                if clean:
                    result = messy_title_parse(result, url)
            else:
                logging.debug("no title found; using default")
                result = "Page on " + tld
        except AttributeError:
            # Probably just empty title when trying to get
            # soup.title.string
            logging.debug("FIXME: this isn't supposed to happen")
            result = "Page on " + tld
    # Keep the text within 255 characters, marking the cut with an ellipsis.
    if len(result) > 255:
        result = result[:253] + " …"

    return result
コード例 #55
0
ファイル: filehunter.py プロジェクト: chrismaddalena/viper
    def process_file(self,curr_file):
        """Process the provided file. If the file is a PDF, the PyPDF2 library will be used.
        Otherwise, the extract tool is used, so extract must be installed. This is the one
        piece that requires Linux.

        Parameters:
        curr_file       The filepath of the file to be processed
        """
        date = "None"
        modded = "None"
        author = "None"
        created = "None"
        producer = "None"
        last_saved = "None"
        # Process the current file as a PDF
        if ".pdf" in curr_file:
            try:
                pdf_file = PdfFileReader(open(curr_file,"rb"))
                if pdf_file.getIsEncrypted():
                    pdf_file.decrypt('')
                # getDocumentInfo() returns something like:
                #   {'/Author': 'Chris Maddalena',
                #   '/CreationDate': "D:20131014182824-04'00'",
                #   '/Creator': 'Microsoft® Excel® 2013',1
                #   '/ModDate': "D:20131015141200-04'00'",
                #   '/Producer': 'Microsoft® Excel® 2013'}
                doc_info = pdf_file.getDocumentInfo()
                # If there is no info, just return
                if not doc_info:
                    return
                # Parse the document into
                if "/CreationDate" in doc_info:
                    data = doc_info["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    created_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M"))
                    created = date + "/" + year + " " + created_time
                if "/Author" in doc_info:
                    author = doc_info["/Author"]
                if "/Producer" in doc_info:
                    producer = doc_info["/Producer"].strip("(Windows)")
                    producer = re.sub(r'[^\w]',' ',producer)
                    while True:
                        if "  " in producer:
                            producer = producer.replace("  "," ")
                        else:
                            break
                if "/ModDate" in doc_info:
                    data = doc_info["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime("%I:%M %p",time.strptime(modded_time,"%H:%M"))
                    modded = date + "/" + year + " "  + modded_time
                # Strips '/' off filename (if it includes directory name)
                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/")+1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\","")
                # Add the document info to the container
                self.container.append([curr_file,created,author,producer,modded,last_saved])
            except Exception:
                return
        # Not a PDF, so treat the current file as an Office doc
        else:
            curr_file = curr_file.replace(" ","\ ").replace("(","\(").replace(")","\)")
            try:
                # Unzip the contents of the document to get the contents of core.xml and app.xml files
                unzipped = zipfile.ZipFile(curr_file)
                doc_xml = lxml.etree.fromstring(unzipped.read("docProps/core.xml"))
                app_xml = lxml.etree.fromstring(unzipped.read("docProps/app.xml"))
                # Namespaces for doc.xml
                dc_ns = {"dc":"http://purl.org/dc/elements/1.1/"}
                cp_ns = {"cp":"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"}
                dcterms_ns = {"dcterms":"http://purl.org/dc/terms/"}
                # Namespaces for app.xml:
                #   app_ns = {"http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
                #   vt_ns = {"vt": "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"}
                #   tags = doc_xml.xpath('//cp:keywords', namespaces=cp_ns)[0].text
                #   description = doc_xml.xpath('//dc:description', namespaces=dc_ns)[0].text
                author = doc_xml.xpath('//dc:creator',namespaces=dc_ns)[0].text
                modded = doc_xml.xpath('//cp:lastModifiedBy',namespaces=cp_ns)[0].text
                created = doc_xml.xpath('//dcterms:created',namespaces=dcterms_ns)[0].text
                last_saved = doc_xml.xpath('//dcterms:modified',namespaces=dcterms_ns)[0].text
                # Convert the created time to a prettier format
                created_date = created.split("T")[0]
                created_time = created.split("T")[1].strip("Z")
                modded_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M:%S"))
                created = created_date + " "  + modded_time
                # Determine the Office application and version that created this document
                for child in app_xml:
                    if 'AppVersion' in child.tag:
                        office_version = child.text
                        if "16." in office_version:
                            version = "2016"
                        elif "15." in office_version:
                            version = "2013"
                        elif "14." in office_version:
                            version = "2010"
                        elif "12." in office_version:
                            version = "2007"
                        if ".xls" in curr_file:
                            producer = "Microsoft Excel " + version
                        elif ".doc" in curr_file:
                            producer = "Microsoft Word " + version
                        elif ".ppt" in curr_file:
                            producer = "Microsoft PowerPoint " + version
                # Remove any slashes in the filename
                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/")+1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\","")
                # Add the results to the container
                self.container.append([curr_file,created,author,producer,modded,last_saved])
            except Exception as error:
                click.secho("[!] Failed to extract metadata from {}!".format(curr_file),fg="red")
                click.secho("L.. Details: {}".format(error),fg="red")
                pass
コード例 #56
0
    def tweak(
        self,
        pdfstream,
        skip_sections=0,
        mainsections_count=None,
        reverse_naming=False
    ):
        """
        Parse the PDF behind ``pdfstream`` and write the split output files.

        The file is reopened from ``pdfstream.name`` in binary mode, decrypted
        with an empty password when it is flagged as encrypted, registered
        page by page and handed to ``self.getdata``. When ``getdata`` reports
        no data, a critical message is logged and nothing is written.
        Otherwise each item yielded by ``self.split_stream`` is passed to
        ``self.printpages``.

        :param pdfstream: object exposing a ``name`` attribute holding the
            path of the PDF to process.
        :param int skip_sections: In order to handle several
            documents in the same file: tells the parser that previous
            sections have been handled by another parser. Forwarded to
            ``self.getdata``.
        :param int mainsections_count: same purpose as above. Forwarded
            unchanged to ``self.getdata``, which decides what ``None`` means.
        :param bool reverse_naming: defaults to False.
            for Port-Parallele - outline is reversed
            (analytic code / entr_name). Forwarded to ``self.printpages``.
        :returns: None
        """
        self.logger.debug("Writing to {0}".format(self.output_dir))
        mkdir_p(self.output_dir, self.logger)
        filename = pdfstream.name
        with open(filename, 'rb') as duplicate_pdfstream:
            inputpdf = PdfFileReader(duplicate_pdfstream)
            if inputpdf.isEncrypted:
                inputpdf.decrypt('')

            pages_nb = inputpdf.getNumPages()
            if not self.pages_to_process:
                # 0 means no restriction
                self.pages_to_process = pages_nb

            self.logger.info("%s has %d pages", filename, pages_nb)
            self.logger.info(
                "Estimated time for completion of %d pages on "
                "an average computer: %.f seconds. Please stand by while "
                "the parsing takes place.",
                self.pages_to_process,
                self._UNITARY_TIME*self.pages_to_process
                )
            # time.clock() was removed in Python 3.8. Prefer perf_counter()
            # where it exists and only fall back to clock() on interpreters
            # that predate it.
            timer = getattr(time, "perf_counter", None) or time.clock
            start = timer()

            self.register_pages(inputpdf, pages_nb)
            if not self.getdata(
                    inputpdf,
                    filename,
                    pages_nb,
                    skip_sections,
                    mainsections_count,
            ):
                self.logger.critical(
                    "No data could be extracted! "
                    "Not splitting, sorry"
                )
                return

            self.logger.debug("Now writing files")

            did_print = False
            for iteration, printinfo in enumerate(self.split_stream(pages_nb)):
                self.printpages(
                    iteration,
                    *printinfo,
                    reverse_naming=reverse_naming
                )
                did_print = True

            if not did_print:
                self.logger.critical("No page of output!")

            duration = timer() - start

            closing_message(self.logger, duration)
コード例 #57
0
ファイル: pdf.py プロジェクト: h4ck3rm1k3/openmedialibrary
def info(pdf):
    """Collect metadata for the PDF file at path ``pdf``.

    Reads the page count, the document-information dictionary and the
    ``dc_*`` XMP attributes through ``PdfFileReader``, then normalizes the
    result: a valid ISBN found under ``identifier`` is moved to ``isbn``,
    dict values are flattened to a space-joined string, ``'Unknown'`` values
    are dropped, ``language`` is mapped through ``get_language``, an
    eight-digit ``date`` is rewritten as ``YYYY-MM-DD`` and a string
    ``author`` is split on ``', '``. The length of the extracted text is
    stored under ``textsize``.

    Parsing is best-effort: any exception raised while reading the PDF is
    logged at debug level and the metadata gathered so far is kept.

    :param pdf: path of the PDF file to inspect.
    :returns: dict of metadata keys to values.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            # Named doc_info so the local does not shadow this function.
            doc_info = pdfreader.getDocumentInfo()
            if doc_info:
                for key in doc_info:
                    if doc_info[key]:
                        try:
                            value = doc_info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            # Keys look like '/Author'; drop the slash.
                            data[key[1:].lower()] = value
                        except Exception:
                            # A single unreadable entry must not discard
                            # the remaining ones.
                            logger.debug('FAILED TO READ %s FROM %s', key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() if isinstance(v, str) else v for v in value if v]
                            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        # Document-info values take precedence over XMP.
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            # Catch Exception rather than everything so that
            # KeyboardInterrupt/SystemExit still propagate.
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    # Disabled alternative that shells out to `pdfinfo`:
    # cmd = ['pdfinfo', pdf]
    # p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    # stdout, stderr = p.communicate()
    # for line in stdout.strip().split('\n'):
    #     parts = line.split(':')
    #     key = parts[0].lower().strip()
    #     if key:
    #         data[key] = ':'.join(parts[1:]).strip()
    # for key in data.keys():
    #     if not data[key]:
    #         del data[key]

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
            # The entry is gone; reading data[key] below would raise
            # KeyError when the dropped key is 'language'.
            continue
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    # 'date' may be a list (from XMP); only a plain string has isdigit().
    if ('date' in data and isinstance(data['date'], str)
            and len(data['date']) == 8 and data['date'].isdigit()):
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
コード例 #58
0
def _match_record(filename, best_hit):
    """Build the result entry for a successful match.

    ``best_hit`` is indexed as ``best_hit[0]`` (stored under 'quality') and
    ``best_hit[1]`` (a record providing 'id' and 'title').
    """
    record = best_hit[1]
    participants = getParticipants(record)
    return {'match': True, 'quality': best_hit[0], 'filename': filename, 'id': record['id'],
            'participants': list(participants),
            'title': record['title']}


def main():
    """Try to match every file in the samples directory against the data provider.

    For each file, the title/author from the PDF document info are searched
    first. When that yields no acceptable hit, the first paragraphs of the
    extracted text are searched as titles instead. One result entry per file
    is collected and a summary is printed at the end.
    """
    # NOTE(review): wrapping sys.stdout in a codecs writer looks like a
    # Python 2 idiom (byte-oriented stdout) - confirm before running on 3.x.
    sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
    mypath = u'../../samples'
    min_tokens = 2
    onlypdfFiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    qualityThreshold = 80
    overallResult = []
    for f in onlypdfFiles:
        print(f)
        try:
            bestHit = []
            title = None
            author = None

            # Keep the file open only while the reader needs it; metadata
            # values are read inside the block because the reader may
            # resolve them lazily from the stream.
            with open(mypath + '/' + f, "rb") as pdf_stream:
                pdfFile = PdfFileReader(pdf_stream)
                if pdfFile.isEncrypted:
                    pdfFile.decrypt('')
                metainfo = pdfFile.getDocumentInfo()

                # in case there are some metadata
                if metainfo is not None:
                    # removing useless (since they are too short) terms
                    if metainfo.title is not None:
                        title = removeShortTerms([metainfo.title], 5)
                        title = None if title == [] else title[0]
                    if metainfo.author is not None:
                        author = removeShortTerms([metainfo.author], 5)
                        author = None if author == [] else author[0]

            # in case there are author and/or title information in the metadata
            if author is not None or title is not None:
                if author is not None and title is not None:
                    query = ('title', title), ('person', author)
                elif author is not None:
                    query = (('person', author),)
                else:
                    query = (('title', title),)
                res = searchDataprovider(query)
                # Same None guard as the text-based search below, so a
                # missing response falls through to the fallback instead of
                # being recorded as an exception.
                if res is not None and res['hits']['total'] > 0:
                    bestHit = selectBestMatch([q[1] for q in query], res['hits']['hits'], qualityThreshold, creatorWeight=1, titleWeight=1)
                    if bestHit is not None:
                        overallResult.append(_match_record(f, bestHit))

            # when there are no metainfomation available or
            # there where no decent results
            if bestHit is None or bestHit == []:
                hits = []
                s = pdf_to_txt(mypath + '/' + f, 0, 0)
                paragraphs = re.split(' *\n+ *', s)
                paragraphs = removeShortTerms(paragraphs, 5)
                for paragraph in paragraphs[:5]:
                    # search only if there are more than min_tokens words
                    if len(paragraph.split()) > min_tokens:
                        res = searchDataprovider((('title', paragraph),))
                        if res is not None and res['hits']['total'] > 0:
                            hit = selectBestMatch(paragraphs, res['hits']['hits'], qualityThreshold)
                            if hit is not None:
                                # hits stays sorted, so the best is last
                                bisect.insort_left(hits, hit)
                if len(hits) > 0:
                    bestHit = hits[-1]
                    # merge creator, person, contributor etc. into the
                    # participants list
                    overallResult.append(_match_record(f, bestHit))
                else:
                    overallResult.append({'match': False, 'reason': 'no match', 'filename': f})
        except (AttributeError, PdfReadError, IOError, AssertionError, KeyError, NotImplementedError, PDFTextExtractionNotAllowed, TypeError) as e:
            print("exception:")
            for arg in e.args:
                print(arg)
            overallResult.append({'match': False, 'reason': 'exception', 'filename': f})
    print("done. Results:")
    for i, r in enumerate(overallResult):
        if r['match']:
            print(str(i) + '.' + ' match: True' + '\n' +
                  '    quality: ' + str(r['quality']) + '\n' +
                  '    filename. ' + r['filename'] + '\n' +
                  '    id: ' + r['id'] + '\n' + '    title: ' + r['title'])
            for p in r['participants']:
                print('    participant: ' + p)
        else:
            print(str(i) + '.' + ' match: False' + '\n' +
                  '   reason: ' + r['reason'] + '\n' +
                  '   filename: ' + r['filename'])
コード例 #59
0
ファイル: pdf2.py プロジェクト: Aholicz/Python-100-Days
"""
Read a PDF file.

Version: 0.1
Author: Luo Hao (骆昊)
Date: 2018-03-26
"""

from PyPDF2 import PdfFileReader

# Path of the sample document shipped with the course material.
pdf_path = './res/Python课程大纲.pdf'

with open(pdf_path, 'rb') as pdf_stream:
    # strict=False is passed through to PdfFileReader as in the original.
    pdf_reader = PdfFileReader(pdf_stream, strict=False)
    print(pdf_reader.numPages)
    # Try the empty password when the document is flagged as encrypted.
    if pdf_reader.isEncrypted:
        pdf_reader.decrypt('')
    # Fetch the page at index 5, then print the page object and its text.
    selected_page = pdf_reader.getPage(5)
    print(selected_page)
    print(selected_page.extractText())