コード例 #1
0
    def downloadpdf(url):
        """Download ``url`` and return its body as a parsed PDF reader.

        Returns ``None`` when the response declares ``text/html`` or when the
        body cannot be buffered or parsed. Exits the process with an
        explanatory message if the connection fails. If the document is
        encrypted, an empty password is tried; when that raises, the raw
        bytes are handed to ``Scratcher.handlepdf`` and its result is
        returned instead.
        """
        try:
            request = requests.get(url, verify=False)
            # .get() avoids a KeyError on responses without a Content-Type.
            if request.headers.get('Content-Type') == 'text/html':
                return None
        except requests.exceptions.ConnectionError:
            sys.exit(
                "\nThere was an error when trying to connect to the domain. Please confirm if the domain is correctly written.\n"
            )
        try:
            objbyte = BytesIO(request.content)
        except Exception as e:
            print(e)
            return None
        try:
            # Redirect stdout to a throwaway buffer while the file is parsed.
            s_stdout = sys.stdout
            sys.stdout = BytesIO()
            try:
                pdf = PdfFileReader(objbyte)
            finally:
                # Restore stdout even when parsing fails; otherwise the
                # error printed below and all later output would be lost.
                sys.stdout = s_stdout
        except Exception as e:
            print(e)
            return None
        if pdf.getIsEncrypted() is True:
            try:
                pdf.decrypt('')
            except Exception:
                # Narrowed from a bare except so Ctrl-C / SystemExit propagate.
                pdf = Scratcher.handlepdf(request.content)

        return pdf
コード例 #2
0
ファイル: pdflib.py プロジェクト: shi-cong/PYSTUDY
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    Args:
        pdf_file_in: Path of the PDF to be watermarked.
        pdf_file_mark: Path of the PDF whose first page is the watermark.
        pdf_file_out: Path the watermarked PDF is written to.

    Returns:
        False if the input is encrypted and decrypting it with an empty
        password raises; otherwise None once the output has been written.
    """
    pdf_output = PdfFileWriter()
    # Context managers close the streams on every path, including the early
    # return below and any exception raised while merging or writing.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password
            try:
                pdf_input.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            else:
                print('用空密码解密成功.')
        # Number of pages in the input PDF
        page_num = pdf_input.getNumPages()
        # Read the watermark PDF
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)
            # Watermark every page
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)
            # Both input streams stay open until the output is written,
            # as they did before this change.
            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
コード例 #3
0
ファイル: pdf.py プロジェクト: uees/happyWork
def add_watermark(pdf_file_mark, pdf_file_in, pdf_file_out):
    """Overlay the first page of ``pdf_file_mark`` on each page of ``pdf_file_in``.

    The merged document is written to ``pdf_file_out``. Returns False when
    the input is encrypted and decrypting it with an empty password raises;
    otherwise the function falls through and returns None.
    """
    with open(pdf_file_in, 'rb') as source_fp:
        source = PdfFileReader(source_fp)

        # Encrypted input: an empty password is the only one attempted.
        if source.getIsEncrypted():
            print('该PDF文件被加密了.')
            try:
                source.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        page_count = source.getNumPages()

        with open(pdf_file_mark, 'rb') as mark_fp:
            writer = PdfFileWriter()
            stamp = PdfFileReader(mark_fp)

            # Pages are fetched lazily, one per iteration, then stamped,
            # compressed and queued for output.
            for page in (source.getPage(n) for n in range(page_count)):
                page.mergePage(stamp.getPage(0))
                page.compressContentStreams()
                writer.addPage(page)

            with open(pdf_file_out, 'wb') as out_fp:
                writer.write(out_fp)
コード例 #4
0
ファイル: scratcher.py プロジェクト: humblepoti/scratcher
 def downloadpdf(url):
     """Fetch ``url`` and return its body as a parsed PDF reader.

     Returns ``None`` when the response declares ``text/html`` and ``2``
     when the body cannot be parsed as a PDF even after stripping NUL bytes
     from both ends. Exits the process when the connection fails or the
     body cannot be buffered. If the document is encrypted, an empty
     password is tried; when that raises, the raw bytes are handed to
     ``Scratcher.handlepdf`` and its result is returned instead.
     """
     try:
         request = requests.get(url, verify=False)
         # A missing header yields None here, which never equals 'text/html'.
         if request.headers.get('Content-Type') == 'text/html':
             return None
     except requests.exceptions.ConnectionError:
         sys.exit(
             "\nThere was an error when trying to connect to the domain. Please confirm if the domain is "
             "correctly written.\n")
     try:
         objbyte = BytesIO(request.content)
     except Exception as e:
         Scratcher.log(url, e)
         sys.exit(
             "\nThere was an error when trying to convert the content of the response.Please verify the logs to"
             " see the raised error.\n")
     try:
         pdf = PdfFileReader(objbyte)
     except utils.PdfReadError as e:
         Scratcher.log(url, e)
         # Second attempt: drop NUL padding around the payload and re-parse.
         obje = BytesIO(request.content.strip(b'\x00'))
         try:
             pdf = PdfFileReader(obje)
         except utils.PdfReadError:
             return 2
     if pdf.getIsEncrypted() is True:
         try:
             pdf.decrypt('')
         except Exception:
             # Narrowed from a bare except so Ctrl-C / SystemExit propagate.
             pdf = Scratcher.handlepdf(request.content)
     return pdf
コード例 #5
0
ファイル: pdflib.py プロジェクト: imze5z/python_study
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    Args:
        pdf_file_in: Path of the PDF to be watermarked.
        pdf_file_mark: Path of the PDF whose first page is the watermark.
        pdf_file_out: Path the watermarked PDF is written to.

    Returns:
        False if the input is encrypted and decrypting it with an empty
        password raises; otherwise None once the output has been written.
    """
    pdf_output = PdfFileWriter()
    # Context managers close the streams on every path, including the early
    # return below and any exception raised while merging or writing.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password
            try:
                pdf_input.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            else:
                print('用空密码解密成功.')
        # Number of pages in the input PDF
        page_num = pdf_input.getNumPages()
        # Read the watermark PDF
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)
            # Watermark every page
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)
            # Both input streams stay open until the output is written,
            # as they did before this change.
            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
コード例 #6
0
def _get_pdf_document_info(fp):
    """Return the document-info object of the PDF behind ``fp``.

    Returns None when the PDF is encrypted and an empty password does not
    unlock it, or when a FileNotFoundError is raised along the way.
    """
    try:
        reader = PdfFileReader(fp)
        # Some PDFs are "encrypted" with an empty password, so that is the
        # one password tried; decrypt() is only called on encrypted files.
        still_locked = reader.getIsEncrypted() and not reader.decrypt("")
        return None if still_locked else reader.getDocumentInfo()
    except FileNotFoundError:
        return None
コード例 #7
0
ファイル: pdf_watermark.py プロジェクト: wcwu/cs_netdisk
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open ``pdf_file_in`` and, if it is encrypted, try an empty password.

    Only the input file is touched by this body: ``pdf_file_mark`` and
    ``pdf_file_out`` are accepted but not used, and the writer created
    below receives no pages.

    Returns:
        False when decrypting with an empty password raises, otherwise None.
    """
    pdf_output = PdfFileWriter()
    # open(), print(...) and a plain "except Exception:" are valid on both
    # Python 2 and Python 3, unlike file(), the print statement and
    # "except Exception, e". The with-block also closes the stream.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream, strict=False)

        # The PDF file is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password
            try:
                pdf_input.decrypt('')
            except Exception:
                return False
            else:
                print('用空密码解密成功.')
コード例 #8
0
ファイル: gen.py プロジェクト: zhangxj/esbi
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open ``pdf_file_in`` and, if it is encrypted, try an empty password.

    Only the input file is touched by this body: ``pdf_file_mark`` and
    ``pdf_file_out`` are accepted but not used, and the writer created
    below receives no pages.

    Returns:
        False when decrypting with an empty password raises, otherwise None.
    """
    pdf_output = PdfFileWriter()
    # open(), print(...) and a plain "except Exception:" are valid on both
    # Python 2 and Python 3, unlike file(), the print statement and
    # "except Exception, e". The with-block also closes the stream.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password
            try:
                pdf_input.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            else:
                print('用空密码解密成功.')
コード例 #9
0
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """
    Get_info(file_path)
        Opens the pdf file for reading, prints its metadata and, when
        requested, logs it. Any exception is reported through
        __Print_error instead of being raised.
    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: Running count of analyzed files; only the local
          copy is incremented here.
        - total_files: Not used by this function.
    """

    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)

    encrypted = 'No'

    try:  # Try to open not password encrypted pdf files and pdf files
        # encrypted with a blank password.
        # open() in a with-block replaces the Python 2-only file() builtin
        # and closes the handle once the reader is no longer needed.
        with open(file_path, 'rb') as pdf_stream:
            pdf_file = PdfFileReader(pdf_stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                # NOTE(review): 'Yes' is recorded only when decrypt('')
                # returns 1 - confirm other return codes should stay 'No'.
                if dec_res == 1:
                    encrypted = 'Yes'

            # Get and parse metadata
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)

            num_pages = pdf_file.getNumPages()

        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator, subject,
                                producer, c_date, m_date, encrypted, num_pages,
                                file_size)

        __Print_metadata(pdf_meta)

        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # NOTE(review): f_log_csv is not defined in this function; it is
            # presumably a module-level name - confirm csv_log was not meant.
            Log(file_name, pdf_meta, f_log_csv, 'csv')

        # NOTE(review): this rebinds the local name only; the caller's
        # counter is not changed and nothing is returned.
        analyzed_files = analyzed_files + 1

    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
コード例 #10
0
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """
    Get_info(file_path)
        Opens the pdf file for reading, prints its metadata and, when
        requested, logs it. Any exception is reported through
        __Print_error instead of being raised.
    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: Running count of analyzed files; only the local
          copy is incremented here.
        - total_files: Not used by this function.
    """

    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)

    encrypted = 'No'

    try:  # Try to open not password encrypted pdf files and pdf files
        # encrypted with a blank password.
        # open() in a with-block replaces the Python 2-only file() builtin
        # and closes the handle once the reader is no longer needed.
        with open(file_path, 'rb') as pdf_stream:
            pdf_file = PdfFileReader(pdf_stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                # NOTE(review): 'Yes' is recorded only when decrypt('')
                # returns 1 - confirm other return codes should stay 'No'.
                if dec_res == 1:
                    encrypted = 'Yes'

            # Get and parse metadata
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)

            num_pages = pdf_file.getNumPages()

        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator,
                                subject, producer, c_date, m_date,
                                encrypted, num_pages, file_size)

        __Print_metadata(pdf_meta)

        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # NOTE(review): f_log_csv is not defined in this function; it is
            # presumably a module-level name - confirm csv_log was not meant.
            Log(file_name, pdf_meta, f_log_csv, 'csv')

        # NOTE(review): this rebinds the local name only; the caller's
        # counter is not changed and nothing is returned.
        analyzed_files = analyzed_files + 1

    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
コード例 #11
0
ファイル: tools.py プロジェクト: blawesom/pdf-merger
def merge_files(local_pdfs):
    """Merge the given uploaded PDFs into one file and return its path.

    Each name in ``local_pdfs`` is resolved with ``getpath`` against the
    configured upload folder, appended to the merge (after trying an empty
    password if it is encrypted) and then removed from disk. The merged
    file is written into the same upload folder.
    """
    # time.clock() was removed in Python 3.8; it is used only where
    # time.perf_counter() does not exist (Python 2).
    clock = getattr(time, 'perf_counter', None) or time.clock
    name = 'merge_{0}_output.pdf'.format(str(clock())[2:])
    merged_export = PdfFileMerger()
    # Handles stay open until the merged file is written and are then
    # closed in the finally block, so none of them leak.
    open_streams = []
    try:
        for pdfile in local_pdfs:
            filepath = getpath(pdfile, config().get(section='server', option='upload_folder'))
            # open() replaces the Python 2-only file() builtin.
            stream = open(filepath, 'rb')
            open_streams.append(stream)
            file_bin = PdfFileReader(stream)
            if file_bin.getIsEncrypted():
                file_bin.decrypt('')

            merged_export.append(fileobj=file_bin)
            os.remove(filepath)
        full_output = getpath(name, config().get(section='server', option='upload_folder'))
        with open(full_output, 'wb') as output:
            merged_export.write(output)
    finally:
        for stream in open_streams:
            stream.close()

    return full_output
コード例 #12
0
ファイル: yapot.py プロジェクト: thequbit/yapot
def split_pdf(pdf_filename, temp_dir):
    '''
    Split the PDF into n PDFs ( one for each page ).

    Each page is written to ``<temp_dir>/<basename>-p<index>.pdf`` with a
    zero-based index. If the PDF is encrypted, an empty password is tried
    first. Returns the list of written file names in page order.
    '''
    filenames = []
    base_name = os.path.basename(pdf_filename)
    # The with-block closes the source file, which was previously left open.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(temp_dir, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
コード例 #13
0
ファイル: yapot_utils.py プロジェクト: thequbit/yapot
def split_pdf(pdf_filename):
    """Split a PDF into one single-page PDF per page, next to the original.

    Each page is written to ``<directory>/<basename>-p<index>.pdf`` with a
    zero-based index, where the directory is that of ``pdf_filename`` (or
    ``.`` when it has none). If the PDF is encrypted, an empty password is
    tried first. Returns the list of written file names in page order.
    """
    filenames = []
    # Both path parts are the same for every page, so compute them once.
    directory = os.path.dirname(pdf_filename)
    if directory == '':
        directory = '.'
    base_name = os.path.basename(pdf_filename)
    # The with-block closes the source file, which was previously left open.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(directory, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
コード例 #14
0
ファイル: helpers.py プロジェクト: J08nY/sec-certs
def extract_pdf_metadata(filepath: Path):
    """Collect file size, encryption flag, page count and document info of a PDF.

    Returns ``(constants.RETURNCODE_OK, metadata)`` on success. If anything
    raises, the failure is logged and ``(error_msg, None)`` is returned.
    Document-info values are stored as strings under their original keys.
    """
    collected = {}

    try:
        collected['pdf_file_size_bytes'] = filepath.stat().st_size
        with filepath.open('rb') as stream:
            reader = PdfFileReader(stream)

            collected['pdf_is_encrypted'] = reader.getIsEncrypted()
            collected['pdf_number_of_pages'] = reader.getNumPages()

            collected.update(
                (field, str(value))
                for field, value in reader.getDocumentInfo().items()
            )

    except Exception as exc:
        failure = f'Failed to read metadata of {filepath}, error: {exc}'
        logger.error(failure)
        return failure, None

    return constants.RETURNCODE_OK, collected
コード例 #15
0
def main():
    """Try each password from a word list against an encrypted PDF.

    Reads the target PDF (``-F``) and the password file (``-P``) from the
    command line; prints the usage text and exits when either is missing.
    Returns True as soon as ``crackPdf`` reports success for a candidate,
    otherwise None (also when the PDF is not encrypted).
    """
    parser = OptionParser('usage %prog -F'+'<target_File>'+'-P <password_File>')
    parser.add_option("-F", dest="targetFile", type='string',
                      help="target PDF File")
    parser.add_option("-P", dest="PasswordFile", type='string',
                      help="Password File")
    (options, args) = parser.parse_args()
    if options.targetFile is None or options.PasswordFile is None:
        print(parser.usage)
        exit(0)
    pdfFile = options.targetFile
    PasswordFile = options.PasswordFile
    # open() replaces the Python 2-only file() builtin; the with-blocks
    # close both files on every exit path.
    with open(pdfFile, 'rb') as pdf_stream:
        pdfFileReader = PdfFileReader(pdf_stream)
        if pdfFileReader.getIsEncrypted():
            with open(PasswordFile, 'r') as fp:
                for line in fp:
                    # Strip CR and LF together: stripping '\r' first and
                    # '\n' second left a trailing '\r' on CRLF lines.
                    passWord = line.strip('\r\n')
                    if crackPdf(pdfFile, passWord, pdfFileReader):
                        return True
        else:
            print('[*] PDF File '+pdfFile+' no encrypted!')
コード例 #16
0
def print_pdf(file_full_path):
    """Print a PDF's document-info entries, page count and encryption flag.

    Each document-info key is printed without its first character; an entry
    whose printing raises TypeError is reported as not readable.
    """
    # Header with file path
    cprint("[+] Metadata for file: %s " % (file_full_path),
           "green",
           attrs=['bold'])
    # Open the file; open() replaces the Python 2-only file() builtin and
    # the with-block closes the handle once everything has been printed.
    with open(file_full_path, 'rb') as pdf_stream:
        pdf_file = PdfFileReader(pdf_stream)
        # Create a dictionary with the info
        pdf_info = pdf_file.getDocumentInfo()
        # Print metadata
        if pdf_info:
            for metaItem in pdf_info:
                try:
                    cprint('\t ' + metaItem[1:] + ': ', 'cyan', end="")
                    cprint(pdf_info[metaItem])
                except TypeError:
                    cprint(
                        '\t ' + metaItem[1:] + ': ' + 'Error - Item not redeable',
                        'red')
        else:
            cprint('Not data found', 'red')
        # Print other info
        cprint("\t Number of pages: %s" % pdf_file.getNumPages(), 'cyan')
        cprint("\t Is Encripted: %s" % pdf_file.getIsEncrypted(), 'cyan')
コード例 #17
0
    def process_file(self, curr_file):
        """Extract metadata from ``curr_file`` and append a row to ``self.container``.

        Files whose name contains ".pdf" are read with the PyPDF2 library.
        Anything else is handed to the external ``extract`` tool, so extract
        must be installed. This is the one piece that requires Linux.

        An appended row is ``[file name, created, author, producer,
        modified, last saved]``; fields that could not be determined stay
        "-". Any exception while processing makes the method return without
        appending a row.
        """
        author = '-'
        date = '-'
        generator = '-'
        created = '-'
        producer = '-'
        modded = '-'
        last_saved = '-'
        if ".pdf" in curr_file:
            try:
                # The with-block closes the PDF, which was previously left
                # open; all reads of doc_info happen while it is open.
                with open(curr_file, 'rb') as pdf_stream:
                    pdf_file = PdfFileReader(pdf_stream)
                    if pdf_file.getIsEncrypted():
                        pdf_file.decrypt('')
                    doc_info = pdf_file.getDocumentInfo()
                    if not doc_info:
                        return
                    # Looks at the entire dictionary to parse for information
                    if "/CreationDate" in doc_info:
                        data = doc_info["/CreationDate"].strip("D:|'")
                        year = data[0:4]
                        date = data[4:6] + "/" + data[6:8]
                        created_time = data[8:10] + ":" + data[10:12]
                        created_time = time.strftime(
                            "%I:%M %p", time.strptime(created_time, "%H:%M"))
                        created = date + "/" + year + " " + created_time

                    if "/Author" in doc_info:
                        author = doc_info["/Author"] + " "
                        if len(author) <= 1:
                            author = "-"

                    if "/Producer" in doc_info:
                        # str.strip("(Windows)") removes any of those
                        # characters from both ends (e.g. the "d" of
                        # "Word"); drop the literal marker instead.
                        producer = doc_info["/Producer"].replace("(Windows)", "")
                        producer = re.sub(r'[^\w]', ' ', producer)
                        if len(producer) == 0:
                            producer = "-"
                        while "  " in producer:
                            producer = producer.replace("  ", " ")

                    if "/ModDate" in doc_info:
                        data = doc_info["/ModDate"].strip("D:|'")
                        year = data[0:4]
                        date = data[4:6] + "/" + data[6:8]
                        modded_time = data[8:10] + ":" + data[10:12]
                        modded_time = time.strftime(
                            "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                        modded = date + "/" + year + " " + modded_time

                # Strips '/' off filename (if it includes directory name)
                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/") + 1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\", "")

                if len(producer) > 30:
                    producer = producer[:20] + " [snipped] "
                if len(author) > 20:
                    author = author[:20] + " [snipped] "

                # Appends each piece of information
                # Output will show ONLY if at least ONE file has data in a column
                self.container.append(
                    [curr_file, created, author, producer, modded, last_saved])
            except Exception:
                return
        else:
            try:
                try:
                    extract_status = subprocess.getstatusoutput("extract")
                except Exception:
                    print(
                        yellow(
                            "[*] We found an Office document, but 'extract' is not installed "
                            "on this system to get the metadata. It is downloaded for later analysis."))
                    # No status is available, so stop here explicitly rather
                    # than failing on the unbound name further down.
                    return

                if extract_status[0] == 0:
                    # The path is passed as its own argument instead of being
                    # spliced into a shell string, so spaces, parentheses and
                    # shell metacharacters in a file name need no escaping
                    # and are never interpreted by a shell.
                    output = subprocess.check_output(["extract", "-V", curr_file])\
                        .decode('utf-8').split('\n')
                    if "extract: not found" in output[0]:
                        print(
                            red("[!] PyFOCA requires the 'extract' command."))
                        print(
                            red("L.. Please install extract by typing 'apt-get install extract' "
                                "    in terminal."))

                    for line in output:
                        if "creator" in line:
                            author = line[line.find("-") + 2:]
                            author = re.sub(r'\W', ' ', author)
                            # Collapse runs of spaces, then drop one leading
                            # space; startswith() is safe on an empty string.
                            while "  " in author:
                                author = author.replace("  ", " ")
                            if author.startswith(" "):
                                author = author[1:]
                        elif "date" in line and "creation" not in line:
                            start = line.find('-') + 2
                            year = line[start:start + 4]
                            pos = line.find(year) + 5
                            date = line[pos:pos + 5].replace("-", "/")
                            modded_time = line[line.find(":") - 2:line.rfind(":") - 1]
                            modded_time = time.strftime(
                                "%I:%M %p",
                                time.strptime(modded_time, "%H:%M"))
                            modded = date + "/" + year + " " + modded_time
                        elif "generator" in line:
                            producer = line[line.find('-') + 2:]
                        elif "creation" in line:
                            start = line.find('-') + 2
                            year = line[start:start + 4]
                            pos = line.find(year) + 5
                            date = line[pos:pos + 5].replace("-", "/")
                            created_time = line[line.find(":") - 2:line.rfind(":") - 1]
                            created_time = time.strftime(
                                "%I:%M %p",
                                time.strptime(created_time, "%H:%M"))
                            created = date + "/" + year + " " + created_time
                        elif "last saved" in line:
                            last_saved = line[line.find('-') + 2:]

                    if "/" in curr_file:
                        curr_file = curr_file[curr_file.rfind("/") + 1:]

                    if "\\" in curr_file:
                        curr_file = curr_file.replace("\\", "")

                    if author != "-" or date != "-" or generator != "-" or created != "-" or \
                        producer != "-" or modded != "-" or last_saved != "-":
                        self.container.append([
                            " | " + curr_file, created, author, producer,
                            modded, last_saved
                        ])
                else:
                    print(
                        yellow(
                            "[*] We found an Office document, but 'extract' is not installed "
                            "on this system to get the metadata. It is downloaded for later analysis."))
            except Exception as error:
                if "command not found" in str(error):
                    print(red("[!] PyFOCA requires the 'extract' command."))
                    print(
                        red("L.. Please install on Linux extract by typing 'apt-get install extract' "
                            "in terminal."))
                    # exit()
                return
コード例 #18
0
ファイル: pdf.py プロジェクト: h4ck3rm1k3/openmedialibrary
def info(pdf):
    """Collect metadata for the PDF at path ``pdf`` into a dict.

    The page count, document-info entries (keys lower-cased without their
    first character) and ``dc_*`` XMP attributes are gathered first; a
    parsing failure is logged at debug level and leaves whatever was
    collected so far. The result is then normalized: a valid ``identifier``
    becomes ``isbn``, dict values are joined into one string, ``'Unknown'``
    values are dropped, ``language`` is mapped through ``get_language``,
    an 8-digit ``date`` becomes ``YYYY-MM-DD`` and a string ``author`` is
    split on ``', '``. ``textsize`` is the length of the extracted text.

    Returns:
        The metadata dict.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            doc_info = pdfreader.getDocumentInfo()
            if doc_info:
                for key in doc_info:
                    if doc_info[key]:
                        try:
                            value = doc_info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # Best effort: an unreadable entry is skipped,
                            # but no longer swallowed without a trace.
                            logger.debug('FAILED TO READ %s FROM %s', key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [
                                v.strip() if isinstance(v, str) else v
                                for v in value if v
                            ]
                            value = [
                                v.strftime('%Y-%m-%d') if isinstance(
                                    v, datetime) else v for v in value
                            ]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        # Document-info entries win over XMP entries.
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)
    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown', ):
            del data[key]
            # The key is gone; reading data[key] again below raised
            # KeyError when the 'language' value was 'Unknown'.
            continue
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
コード例 #19
0
ファイル: filehunter.py プロジェクト: chrismaddalena/viper
    def process_file(self,curr_file):
        """Process the provided file and append its metadata to ``self.container``.

        If the file name contains ".pdf", the PyPDF2 library is used.
        Otherwise the file is opened as a zip archive and the Office
        ``docProps/core.xml`` and ``docProps/app.xml`` parts are parsed.

        An appended row is ``[file name, created, author, producer,
        modified, last saved]``; fields that could not be determined stay
        "None". A PDF that raises while being processed is skipped silently;
        an Office document that raises is reported on the console.

        Parameters:
        curr_file       The filepath of the file to be processed
        """
        date = "None"
        modded = "None"
        author = "None"
        created = "None"
        producer = "None"
        last_saved = "None"
        # Process the current file as a PDF
        if ".pdf" in curr_file:
            try:
                # The with-block closes the PDF, which was previously left
                # open; all reads of doc_info happen while it is open.
                with open(curr_file,"rb") as pdf_stream:
                    pdf_file = PdfFileReader(pdf_stream)
                    if pdf_file.getIsEncrypted():
                        pdf_file.decrypt('')
                    # getDocumentInfo() returns something like:
                    #   {'/Author': 'Chris Maddalena',
                    #   '/CreationDate': "D:20131014182824-04'00'",
                    #   '/Creator': 'Microsoft® Excel® 2013',
                    #   '/ModDate': "D:20131015141200-04'00'",
                    #   '/Producer': 'Microsoft® Excel® 2013'}
                    doc_info = pdf_file.getDocumentInfo()
                    # If there is no info, just return
                    if not doc_info:
                        return
                    # Parse the document info into the row fields
                    if "/CreationDate" in doc_info:
                        data = doc_info["/CreationDate"].strip("D:|'")
                        year = data[0:4]
                        date = data[4:6] + "/" + data[6:8]
                        created_time = data[8:10] + ":" + data[10:12]
                        created_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M"))
                        created = date + "/" + year + " " + created_time
                    if "/Author" in doc_info:
                        author = doc_info["/Author"]
                    if "/Producer" in doc_info:
                        # str.strip("(Windows)") removes any of those
                        # characters from both ends (e.g. the "d" of
                        # "Word"); drop the literal marker instead.
                        producer = doc_info["/Producer"].replace("(Windows)","")
                        producer = re.sub(r'[^\w]',' ',producer)
                        while "  " in producer:
                            producer = producer.replace("  "," ")
                    if "/ModDate" in doc_info:
                        data = doc_info["/ModDate"].strip("D:|'")
                        year = data[0:4]
                        date = data[4:6] + "/" + data[6:8]
                        modded_time = data[8:10] + ":" + data[10:12]
                        modded_time = time.strftime("%I:%M %p",time.strptime(modded_time,"%H:%M"))
                        modded = date + "/" + year + " "  + modded_time
                # Strips '/' off filename (if it includes directory name)
                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/")+1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\","")
                # Add the document info to the container
                self.container.append([curr_file,created,author,producer,modded,last_saved])
            except Exception:
                return
        # Not a PDF, so treat the current file as an Office doc
        else:
            # The path is used as-is: shell-style escaping of spaces and
            # parentheses made zipfile look for a file that does not exist.
            try:
                # Unzip the contents of the document to get the contents of core.xml and app.xml files
                with zipfile.ZipFile(curr_file) as unzipped:
                    doc_xml = lxml.etree.fromstring(unzipped.read("docProps/core.xml"))
                    app_xml = lxml.etree.fromstring(unzipped.read("docProps/app.xml"))
                # Namespaces for doc.xml
                dc_ns = {"dc":"http://purl.org/dc/elements/1.1/"}
                cp_ns = {"cp":"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"}
                dcterms_ns = {"dcterms":"http://purl.org/dc/terms/"}
                # Namespaces for app.xml:
                #   app_ns = {"http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
                #   vt_ns = {"vt": "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"}
                #   tags = doc_xml.xpath('//cp:keywords', namespaces=cp_ns)[0].text
                #   description = doc_xml.xpath('//dc:description', namespaces=dc_ns)[0].text
                author = doc_xml.xpath('//dc:creator',namespaces=dc_ns)[0].text
                modded = doc_xml.xpath('//cp:lastModifiedBy',namespaces=cp_ns)[0].text
                created = doc_xml.xpath('//dcterms:created',namespaces=dcterms_ns)[0].text
                last_saved = doc_xml.xpath('//dcterms:modified',namespaces=dcterms_ns)[0].text
                # Convert the created time to a prettier format
                created_date = created.split("T")[0]
                created_time = created.split("T")[1].strip("Z")
                pretty_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M:%S"))
                created = created_date + " "  + pretty_time
                # Determine the Office application and version that created this document
                known_versions = (("16.","2016"),("15.","2013"),("14.","2010"),("12.","2007"))
                for child in app_xml:
                    if 'AppVersion' in child.tag:
                        office_version = child.text
                        # An unlisted version used to leave `version` unbound
                        # and fail the whole document; fall back to the raw
                        # AppVersion text instead.
                        version = next(
                            (year for marker,year in known_versions if marker in office_version),
                            office_version)
                        if ".xls" in curr_file:
                            producer = "Microsoft Excel " + version
                        elif ".doc" in curr_file:
                            producer = "Microsoft Word " + version
                        elif ".ppt" in curr_file:
                            producer = "Microsoft PowerPoint " + version
                # Remove any slashes in the filename
                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/")+1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\","")
                # Add the results to the container
                self.container.append([curr_file,created,author,producer,modded,last_saved])
            except Exception as error:
                click.secho("[!] Failed to extract metadata from {}!".format(curr_file),fg="red")
                click.secho("L.. Details: {}".format(error),fg="red")
コード例 #20
0
    def processFile(self, curr_file):
        """Extract metadata from a PDF and append one row to ``self.container``.

        Only paths containing ".pdf" are processed; any other path is ignored.
        The appended row is ``[" | " + name, created, author, producer,
        modded, last_saved]``; a field that is absent from the document is
        reported as "-". Extraction is best-effort: any error raised while
        opening or parsing the document skips the file without adding a row.

        Parameters:
        curr_file       The filepath of the file to be processed
        """
        if ".pdf" not in curr_file:
            return

        def format_pdf_date(raw):
            # Expects the "D:YYYYMMDDHHMM..." layout; only the fixed-width
            # leading fields are used. Result: "MM/DD/YYYY HH:MM AM/PM".
            data = raw.strip("D:|'")
            clock = time.strftime(
                "%I:%M %p",
                time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
            return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

        author = '-'
        created = '-'
        producer = '-'
        modded = '-'
        last_saved = '-'
        try:
            # Keep the stream open while the info dictionary is read: its
            # values may be resolved from the file only when accessed.
            with open(curr_file, 'rb') as pdf_stream:
                pdfFile = PdfFileReader(pdf_stream)
                if pdfFile.getIsEncrypted():
                    pdfFile.decrypt('')
                docInfo = pdfFile.getDocumentInfo()
                if not docInfo:
                    return
                # Look through the info dictionary for the fields we report
                if "/CreationDate" in docInfo:
                    created = format_pdf_date(docInfo["/CreationDate"])
                if "/Author" in docInfo:
                    author = docInfo["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"
                if "/Producer" in docInfo:
                    # Drop a literal leading/trailing "(Windows)" marker.
                    # str.strip("(Windows)") would strip any of those
                    # *characters* and mangle names such as "iText".
                    producer = re.sub(r'^\(Windows\)|\(Windows\)$', '',
                                      docInfo["/Producer"])
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # Collapse the runs of spaces left by the substitution
                    producer = re.sub(r' {2,}', ' ', producer)
                if "/ModDate" in docInfo:
                    modded = format_pdf_date(docInfo["/ModDate"])

            # Strip the directory part (and any backslashes) off the name
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")

            # Trim information if it's too long
            if len(curr_file) > 15:
                curr_file = curr_file[:15] + "..." + curr_file[-13:]
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            self.container.append([
                " | " + curr_file, created, author, producer, modded,
                last_saved
            ])
        except Exception:
            # Best-effort: an unreadable or malformed PDF is simply skipped
            return
コード例 #21
0
ファイル: filehunter.py プロジェクト: firebitsbr/ODIN-1
    def process_file(self, curr_file):
        """Extract document metadata from a file and record it in ``self.container``.

        Any path containing ".pdf" is read with PdfFileReader. Every other file is
        treated as an Office document: it is opened as a zip archive and its
        docProps/core.xml and docProps/app.xml parts are parsed with lxml.

        On success one row, [filename, created, author, producer, modded, last_saved],
        is appended to ``self.container``; fields that were not found hold the string
        "None". A PDF that cannot be read is skipped silently. An Office document that
        cannot be read is reported on the console and skipped.

        Parameters:
        curr_file       The filepath of the file to be processed
        """
        modded = "None"
        author = "None"
        created = "None"
        producer = "None"
        last_saved = "None"

        def format_pdf_date(raw):
            # Input looks like "D:20131014182824-04'00'"; only the fixed-width
            # leading fields are used. Result: "MM/DD/YYYY HH:MM AM/PM".
            data = raw.strip("D:|'")
            clock = time.strftime(
                "%I:%M %p",
                time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
            return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

        def display_name(path):
            # Keep only the part after the last "/" and drop any backslashes
            if "/" in path:
                path = path[path.rfind("/") + 1:]
            if "\\" in path:
                path = path.replace("\\", "")
            return path

        # Process the current file as a PDF
        if ".pdf" in curr_file:
            try:
                # Keep the stream open while the info dictionary is read: its
                # values may be resolved from the file only when accessed.
                with open(curr_file, "rb") as pdf_stream:
                    pdf_file = PdfFileReader(pdf_stream)
                    if pdf_file.getIsEncrypted():
                        pdf_file.decrypt('')
                    # getDocumentInfo() returns something like:
                    #   {'/Author': 'Chris Maddalena',
                    #   '/CreationDate': "D:20131014182824-04'00'",
                    #   '/Creator': 'Microsoft® Excel® 2013',
                    #   '/ModDate': "D:20131015141200-04'00'",
                    #   '/Producer': 'Microsoft® Excel® 2013'}
                    doc_info = pdf_file.getDocumentInfo()
                    # If there is no info, just return
                    if not doc_info:
                        return
                    if "/CreationDate" in doc_info:
                        created = format_pdf_date(doc_info["/CreationDate"])
                    if "/Author" in doc_info:
                        author = doc_info["/Author"]
                    if "/Producer" in doc_info:
                        # Drop a literal leading/trailing "(Windows)" marker.
                        # str.strip("(Windows)") would strip any of those
                        # *characters* and mangle names such as "iText".
                        producer = re.sub(r'^\(Windows\)|\(Windows\)$', '',
                                          doc_info["/Producer"])
                        producer = re.sub(r'[^\w]', ' ', producer)
                        # Collapse the runs of spaces left by the substitution
                        producer = re.sub(r' {2,}', ' ', producer)
                    if "/ModDate" in doc_info:
                        modded = format_pdf_date(doc_info["/ModDate"])
                # Add the document info to the container
                self.container.append([
                    display_name(curr_file), created, author, producer,
                    modded, last_saved
                ])
            except Exception:
                # Best-effort: an unreadable or malformed PDF is simply skipped
                return
        # Not a PDF, so treat the current file as an Office doc
        else:
            try:
                # The path is handed straight to zipfile (no shell involved), so it
                # must not be shell-escaped: escaping breaks names containing
                # spaces or parentheses.
                with zipfile.ZipFile(curr_file) as unzipped:
                    doc_xml = lxml.etree.fromstring(
                        unzipped.read("docProps/core.xml"))
                    app_xml = lxml.etree.fromstring(
                        unzipped.read("docProps/app.xml"))
                # Namespaces for core.xml
                dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}
                cp_ns = {
                    "cp":
                    "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
                }
                dcterms_ns = {"dcterms": "http://purl.org/dc/terms/"}
                author = doc_xml.xpath('//dc:creator',
                                       namespaces=dc_ns)[0].text
                modded = doc_xml.xpath('//cp:lastModifiedBy',
                                       namespaces=cp_ns)[0].text
                created = doc_xml.xpath('//dcterms:created',
                                        namespaces=dcterms_ns)[0].text
                last_saved = doc_xml.xpath('//dcterms:modified',
                                           namespaces=dcterms_ns)[0].text
                # Convert the created time to a prettier format
                created_date = created.split("T")[0]
                created_time = created.split("T")[1].strip("Z")
                created_clock = time.strftime(
                    "%I:%M %p", time.strptime(created_time, "%H:%M:%S"))
                created = created_date + " " + created_clock
                # Determine the Office application and version that created this document
                office_releases = (("16.", "2016"), ("15.", "2013"),
                                   ("14.", "2010"), ("12.", "2007"))
                for child in app_xml:
                    if 'AppVersion' in child.tag:
                        office_version = child.text
                        # Fall back to the raw AppVersion for an unrecognised release
                        # instead of failing on an unbound name
                        version = next(
                            (release for marker, release in office_releases
                             if marker in office_version), office_version)
                        if ".xls" in curr_file:
                            producer = "Microsoft Excel " + version
                        elif ".doc" in curr_file:
                            producer = "Microsoft Word " + version
                        elif ".ppt" in curr_file:
                            producer = "Microsoft PowerPoint " + version
                # Add the results to the container
                self.container.append([
                    display_name(curr_file), created, author, producer,
                    modded, last_saved
                ])
            except Exception as error:
                click.secho("[!] Failed to extract metadata from {}!".format(
                    curr_file),
                            fg="red")
                click.secho("L.. Details: {}".format(error), fg="red")
コード例 #22
0
    output.close()
    print("Your hot 'n' ready PDF awaits.")
    print(
        "You'll find it this directory with '-reordered' appended to the filename."
    )


if __name__ == "__main__":
    pdf_path = input(
        'Enter the path to your PDF (e.g. "path/to/your/pdf.pdf"):  ').strip()
    if pdf_path[-1] == '/':
        pdf_path = pdf_path[0:-1]
    regex = r"(?P<filename>[\w\-\_]+)(\.|$)"
    title = re.search(regex, pdf_path).groupdict()['filename']
    reader = PdfFileReader(pdf_path)
    if reader.getIsEncrypted():
        try:
            reader.decrypt('')
        except:
            print(
                "This file was detected as encrypted. Attempted to decrypt with empty password, but failed."
            )
            print(
                "If this file is not encrypted with a password, you can try decrypting with QPDF."
            )
            print(
                "Please make sure you have installed QPDF before saying yes to the following prompt..."
            )
            tryWithQPDF = input(
                "Would you like to try decrypting with QPDF?: ")
            if re.match(r"(y|Y|Yes|yes)", tryWithQPDF):
コード例 #23
0
ファイル: PyPDF_002.py プロジェクト: snzolnikov/PDF
#!/usr/bin/python
from PyPDF2 import PdfFileReader

# Demo: open a PDF and dump what PdfFileReader exposes about it.
pdf_document = "file.pdf"
with open(pdf_document, "rb") as stream:
    reader = PdfFileReader(stream)

    # Document-level facts are fetched first, then reported.
    metadata = reader.getDocumentInfo()
    page_count = reader.getNumPages()
    print('file information: ', metadata)
    print("number of pages: %i" % (page_count,))

    # The first page is loaded before the remaining reader attributes are shown.
    first_page = reader.getPage(0)
    print(reader.getIsEncrypted())
    print(reader.pageMode)
    print(reader.getFields())
    print(reader.stream)
    print(reader.flattenedPages)

    # Finally the page object itself and its extracted text.
    print(first_page)
    print(first_page.extractText())
コード例 #24
0
    def export_to_file(self, file_out, only_selected=False):
        """Write the pages listed in ``self.model`` to a new PDF at *file_out*.

        Every document in ``self.pdfqueue`` is opened from its ``copyname``.
        Each model row then selects a source document (``row[2]``, 1-based)
        and page (``row[3]``, 1-based), a rotation (``row[6]``) and four crop
        fractions (``row[7]``..``row[10]``) that are applied to a copy of the
        page before it is added to the output.

        Parameters:
        file_out        Path of the PDF file to create
        only_selected   When true, rows whose path is not in the icon view's
                        current selection are skipped

        Raises:
        Exception       If an input file is encrypted and cannot be decrypted
                        with an empty password
        """

        selection = self.iconview.get_selected_items()
        pdf_output = PdfFileWriter()
        pdf_input = []
        # The input streams must stay open until the output has been written,
        # so they are tracked here and closed in the ``finally`` below.
        open_streams = []
        try:
            for pdfdoc in self.pdfqueue:
                stream = open(pdfdoc.copyname, 'rb')
                open_streams.append(stream)
                pdfdoc_inp = PdfFileReader(stream)
                if pdfdoc_inp.getIsEncrypted():
                    try:  # Workaround for lp:#355479
                        stat = pdfdoc_inp.decrypt('')
                    except Exception:
                        stat = 0
                    if stat != 1:
                        errmsg = _(
                            'File %s is encrypted.\n'
                            'Support for encrypted files has not been implemented yet.\n'
                            'File export failed.') % pdfdoc.filename
                        raise Exception(errmsg)
                    #FIXME
                    #else
                    #   ask for password and decrypt file
                pdf_input.append(pdfdoc_inp)

            for row in self.model:

                if only_selected and row.path not in selection:
                    continue

                # add pages from input to output document
                nfile = row[2]
                npage = row[3]
                current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
                angle = row[6]
                angle0 = current_page.get("/Rotate", 0)
                crop = [row[7], row[8], row[9], row[10]]
                if angle != 0:
                    current_page.rotateClockwise(angle)
                if crop != [0., 0., 0., 0.]:
                    # Number of quarter turns of the page as displayed
                    rotate_times = int(
                        round(((angle + angle0) % 360) / 90) % 4)
                    crop_init = crop
                    if rotate_times != 0:
                        # Permute the crop sides to follow the rotation
                        perm = [0, 2, 1, 3]
                        for it in range(rotate_times):
                            perm.append(perm.pop(0))
                        perm.insert(1, perm.pop(2))
                        crop = [crop_init[perm[side]] for side in range(4)]
                    (x1, y1) = [
                        float(xy) for xy in current_page.mediaBox.lowerLeft
                    ]
                    (x2, y2) = [
                        float(xy) for xy in current_page.mediaBox.upperRight
                    ]
                    # Shrink the media box by the crop fraction on each side
                    x1_new = int(x1 + (x2 - x1) * crop[0])
                    x2_new = int(x2 - (x2 - x1) * crop[1])
                    y1_new = int(y1 + (y2 - y1) * crop[3])
                    y2_new = int(y2 - (y2 - y1) * crop[2])
                    current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                    current_page.mediaBox.upperRight = (x2_new, y2_new)

                pdf_output.addPage(current_page)

            # finally, write the output document; the context manager makes
            # sure the file is flushed and closed
            with open(file_out, 'wb') as out_stream:
                pdf_output.write(out_stream)
        finally:
            for stream in open_streams:
                stream.close()
コード例 #25
0
ファイル: pdf.py プロジェクト: h4ck3rm1k3/openmedialibrary
def info(pdf):
    """Collect metadata for the PDF at path *pdf* and return it as a dict.

    The result is assembled from several sources:

    * ``pages``: the reader's page count.
    * Document-info entries, keyed by the entry name without its leading
      character and lower-cased (``/Author`` -> ``author``).
    * XMP ``dc_*`` attributes, keyed by the name without the ``dc_`` prefix;
      these only fill keys not already present.
    * ``textsize``: length of the extracted text.
    * ``isbn``: a list, taken from a valid ``identifier`` entry or, when
      text extraction is enabled in the settings, from the extracted text.

    Values equal to 'Unknown' are dropped, ``language`` is passed through
    ``get_language``, an eight-digit ``date`` is rewritten as YYYY-MM-DD and
    a string ``author`` is split on ', ' into a list. Parsing failures are
    logged at debug level and leave the affected keys out.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info:
                    if info[key]:
                        try:
                            value = info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # Best-effort: skip an entry that cannot be converted
                            pass

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() if isinstance(v, str) else v for v in value if v]
                            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']
    # Flatten dict values into a single space-joined string
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
            # The key is gone; reading it again below would raise KeyError
            continue
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text'] and 'isbn' not in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    # Only a plain string can be an eight-digit date; a list of that length
    # has no isdigit()
    date = data.get('date')
    if isinstance(date, str) and len(date) == 8 and date.isdigit():
        data['date'] = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data