Esempio n. 1
0
def get_file_mimetype(file):
    """Detect the MIME type of a file, refining PDF and generic results.

    ExifTool supplies the first guess.  A PDF whose XMP metadata declares a
    PDF/A version listed in ``app.config["PDFA"]["ACCEPTED_VERSIONS"]`` is
    reported as ``"application/pdfa"``.  A type listed in
    ``app.config["GENERIC_MIMETYPES"]`` is re-checked with libmagic, first by
    MIME type and then by matching the description of the first 2048 bytes
    against ``app.config["FILEFORMATS"]``.

    :param file: object whose ``name`` attribute is the path to inspect
    :return: the detected MIME type, or ``"Unknown/Corrupted"`` when a
        ``ValueError`` or ``PdfReadError`` is raised during detection
    """
    try:
        # Use the helper as a context manager so it is shut down right after
        # the lookup instead of staying alive until garbage collection.
        with exiftool.ExifToolHelper() as exif_helper:
            mimeTypeFile = exif_helper.get_metadata(file.name)[0]["File:MIMEType"]

        if mimeTypeFile == "application/pdf":
            # Check whether the PDF is a PDF/A of an accepted version
            with open(file.name, mode="rb") as fileData:
                pdf_reader = PdfFileReader(fileData, strict=False)
                try:
                    metadata = pdf_reader.getXmpMetadata()
                    if metadata:
                        pdfa = app.config["PDFA"]
                        nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                        if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                            mimeTypeFile = "application/pdfa"
                except ExpatError:
                    app.logger.log(logging.WARNING, "File {0} has not well-formed XMP data, could not verify if application/pdf has PDF/A1 DOCINFO.".format(file.name))

        elif mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
            mimeTypeFile = magic.from_file(file.name, mime=True)
            if mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
                with open(file.name, mode="rb") as fileData:
                    documentTypeFile = magic.from_buffer(fileData.read(2048))
                # zip() stops at the shorter list.  zip_longest() padded the
                # shorter one with None, which either raised TypeError on the
                # membership test or produced a None MIME type.
                for fileMimetype, fileFormat in zip(app.config["FILEMIMETYPES"], app.config["FILEFORMATS"]):
                    # No break: when several formats match, the last one wins.
                    if documentTypeFile in fileFormat:
                        mimeTypeFile = fileMimetype
    except (ValueError, PdfReadError):
        mimeTypeFile = "Unknown/Corrupted"
    return mimeTypeFile
Esempio n. 2
0
    def test_PdfReaderJpegImage(self):
        """
        Load ``jpeg.pdf``, pull the ``/Im4`` image out of its first page and
        check that the hex dump of the image data equals the text stored in
        ``jpeg.txt``.
        """
        pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
        expected_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

        with open(pdf_path, 'rb') as pdf_stream:
            reader = PdfFileReader(pdf_stream)

            # Reference hex dump of the embedded image
            with open(expected_path, 'r') as expected_file:
                expected_hex = expected_file.read()

            first_page = reader.getPage(0)
            x_objects = first_page['/Resources']['/XObject'].getObject()
            image_bytes = x_objects['/Im4'].getData()
            actual_hex = binascii.hexlify(image_bytes).decode()

            failure_msg = (
                'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (expected_hex, actual_hex))
            self.assertEqual(actual_hex, expected_hex, msg=failure_msg)
Esempio n. 3
0
def extract_information(pdf_path, pagenum):
    """Turn the text of one PDF page into speech and return the document info.

    The text of the requested page is extracted, stripped, printed, and
    handed to gTTS, which saves the English speech as ``file.mp3`` (a
    relative path, so it lands in the current working directory).

    :param pdf_path: path of the PDF to read
    :param pagenum: page index passed to ``PdfFileReader.getPage``
    :return: the value of ``PdfFileReader.getDocumentInfo()`` for the file
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        page_text = pdf.getPage(pagenum).extractText().strip()

    print(page_text)

    # Pass the language by keyword: in current gTTS releases the second
    # positional parameter is ``tld``, so a positional 'en' would be taken as
    # the Google domain rather than the language.
    tts = gTTS(page_text, lang='en')
    tts.save("file.mp3")

    return information
Esempio n. 4
0
def PDFMerge(savePath, pdfPath, watermarkPdfPath):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    The watermark is translated so that its upper-right corner offset centres
    it on each page, and the merged pages are written to ``savePath``.

    :param savePath: path of the PDF to create
    :param pdfPath: path of the PDF whose pages receive the watermark
    :param watermarkPdfPath: path of the PDF whose first page is the watermark
    """
    # Context managers guarantee that all three files are closed (and the
    # output flushed) even when a page fails to merge.
    with open(pdfPath, 'rb') as pdfFile, \
            open(watermarkPdfPath, 'rb') as watermarkPdfFile:
        # Load the source PDF
        pdfReader = PdfFileReader(pdfFile, strict=False)

        # Load the watermark PDF and take its first page
        watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)

        pdfWriter = PdfFileWriter()

        # Repeat for every page of the source PDF
        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)

            # Offsets that place the watermark in the centre of the page
            x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
            y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2

            # Merge the watermark page into the current page
            pageObj.mergeTranslatedPage(page2=watermarkPdf,
                                        tx=x,
                                        ty=y,
                                        expand=False)

            # Queue the merged page for the output PDF
            pdfWriter.addPage(pageObj)

        # Save while the inputs are still open, since the writer reads page
        # data from the source streams when it writes.
        with open(savePath, 'wb') as resultFile:
            pdfWriter.write(resultFile)
Esempio n. 5
0
def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata = False, make_backup = True):
    """Rewrite a PDF with updated document-information entries.

    The original file is first renamed to a backup, then every page is copied
    into a new file at ``pdf_file`` together with the new metadata.

    :param pdf_file: path of the PDF to update (rewritten in place)
    :param metadata: mapping of info keys (without the leading ``/``) to
        values; each value is converted with ``str`` before being stored
    :param substitute_all_metadata: when false, the existing info entries are
        copied over first and ``metadata`` is applied on top; when true only
        ``metadata`` is written
    :param make_backup: a string is used as the backup path; ``False``
        deletes the backup once the new file is written; any other value
        keeps a ``.bak`` file next to the original
    """
    if isinstance(make_backup, str):
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()

        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()

        info = pdf_in.documentInfo
        if not substitute_all_metadata:
            for key in info:
                infoDict.update({key: info[key]})

        for key in metadata:
            infoDict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

    # Delete the backup only after its handle is closed: removing a file that
    # is still open fails on Windows.  The equality test is kept so that 0 is
    # still treated like False.
    if make_backup == False:
        os.unlink(bak_file)
Esempio n. 6
0
def get_reader(filename, password):
    """Open a PDF and return a reader that is ready to use, or ``None``.

    The opened file is stored in the module-level ``old_file`` and is left
    open on success, because the returned reader keeps reading from it.

    :param filename: path of the PDF to open
    :param password: password for encrypted files, or ``None``
    :return: a ``PdfFileReader``, or ``None`` when the file cannot be opened,
        needs a password that was not given, or the password is wrong
    """
    global old_file
    try:
        old_file = open(filename, 'rb')
    except Exception as err:
        print('文件打开失败!' + str(err))
        write_result(filename, '文件打开失败')
        return None

    # Create the reader; do not leak the handle if parsing fails
    try:
        pdf_reader = PdfFileReader(old_file, strict=False)
    except Exception:
        old_file.close()
        raise

    # Decrypt when necessary
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            write_result(filename, '文件需要密码')
            old_file.close()
            return None
        # decrypt() returns 0 on failure, 1 for a user-password match and 2
        # for an owner-password match, so only 0 means "wrong password".
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None
    return pdf_reader
Esempio n. 7
0
def pdf_mediabox(filename):
    """Report the size of the first page of a PDF in millimetres.

    :param filename: path of the PDF to measure
    :return: ``result_line(filename, height, width)`` with both dimensions
        converted via ``points_to_mm`` and rounded
    """
    # Read the dimensions inside the ``with`` so the handle is closed
    # afterwards instead of being leaked.
    with open(filename, 'rb') as stream:
        media_box = PdfFileReader(stream).getPage(0).mediaBox
        width = media_box.getWidth()
        height = media_box.getHeight()
    return result_line(filename, round(float(height) * points_to_mm),
                       round(float(width) * points_to_mm))
Esempio n. 8
0
    def split(file):
        """
        Split the PDF chosen by the user page by page and save each page as a
        new PDF file in ``Splitter.splitter_dir``. Every output file holds
        one page of the original document.
        :param file: the file chosen by the user to be split
        :return: None
        """
        # Clean the directory to avoid duplicated files/directories
        Splitter.cleanDir()

        # Normalise the file name
        file = Merger.toPath(file)

        with open(file, mode='rb') as pdf_file_to_read:
            # Parse the document once; the previous code rebuilt the reader
            # for every single page.
            pdf_file = PdfFileReader(pdf_file_to_read)

            for page in range(pdf_file.numPages):
                current_page = PdfFileWriter()
                current_page.addPage(pdf_file.getPage(page))
                # Output files are numbered from 1
                with open(join(Splitter.splitter_dir,
                               f"página_{page + 1}.pdf"),
                          mode='wb') as pdf:
                    current_page.write(pdf)
Esempio n. 9
0
    def test_PdfReaderFileLoad(self):
        """
        Parse ``crazyones.pdf`` and check that the text of its first page,
        with newlines removed and encoded as UTF-8, equals the bytes stored
        in ``crazyones.txt``.
        """
        pdf_path = os.path.join(RESOURCE_ROOT, 'crazyones.pdf')
        reference_path = os.path.join(RESOURCE_ROOT, 'crazyones.txt')

        with open(pdf_path, 'rb') as pdf_stream:
            first_page = PdfFileReader(pdf_stream).getPage(0)

            # Known-good text of the page
            with open(reference_path, 'rb') as reference_file:
                expected_bytes = reference_file.read()

            extracted_text = first_page.extractText()
            actual_bytes = extracted_text.replace('\n', '').encode('utf-8')

            failure_msg = (
                'PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (expected_bytes, actual_bytes))
            self.assertEqual(actual_bytes, expected_bytes, msg=failure_msg)
Esempio n. 10
0
def check_file_content(original, converted):
    """Sanity-check a converted PDF against the PDF it was produced from.

    :param original: path of the source PDF
    :param converted: path of the converted PDF
    :return: ``True`` when both files have the same number of pages and the
        first page of the converted file has a content stream, else ``False``
    """
    with open(original, mode="rb") as original_data:
        original_page_num = PdfFileReader(original_data, strict=False).numPages

    with open(converted, mode="rb") as converted_data:
        converted_pdf = PdfFileReader(converted_data, strict=False)
        if original_page_num != converted_pdf.numPages:
            return False
        # Inspect a real page of the converted document.  The earlier code
        # built a fresh PageObject, which has no /Contents entry, so the
        # check failed for every file.
        if original_page_num > 0 and converted_pdf.getPage(0).getContents() is None:
            return False
    return True
    def start_Encryption(self):
        """Encrypt the selected PDF with the passwords typed into the form.

        The encrypted copy is written next to the source with ``.pdf``
        replaced by ``_encrypted.pdf``.  On success the form is reset and
        disabled; a missing password or a failure is reported in a message
        box.
        """

        def notify(title, icon, text):
            # Show a modal message box with the given title, icon and text.
            box = QMessageBox()
            box.setWindowTitle(title)
            box.setIcon(icon)
            box.setText(text)
            box.exec_()

        try:
            # self.filename is reset to '' after a successful run, so guard
            # the index access.
            source = self.filename[0] if self.filename else ''
            user_password = self.userPassword.text()
            owner_password = self.ownerPassword.text()

            if source and user_password and owner_password:
                pfw = PdfFileWriter()
                pdffile = PdfFileReader(source)

                for page in range(pdffile.numPages):
                    pfw.addPage(pdffile.getPage(page))

                pfw.encrypt(user_password, owner_password)

                # ``with`` closes the output even when writing fails
                with open(source.replace('.pdf', '_encrypted.pdf'), 'wb') as file:
                    pfw.write(file)

                notify('Done', QMessageBox.Information,
                       'File encryption done successfully.')

                self.filename = ''

                self.userPassword.setText('')
                self.ownerPassword.setText('')
                self.userPassword.setDisabled(True)
                self.ownerPassword.setDisabled(True)
                self.startEncryption.setDisabled(True)

            else:
                if owner_password == '':
                    notify('Error', QMessageBox.Critical,
                           'Owner Password Field is Empty.')

                if user_password == '':
                    notify('Error', QMessageBox.Critical,
                           'User Password Field is Empty.')
        except Exception as err:
            # Report the failure instead of silently discarding it; still
            # never let the exception escape into the GUI event loop.
            notify('Error', QMessageBox.Critical,
                   'File encryption failed: %s' % err)
Esempio n. 12
0
def pdf_metadata_load(pdf_file):
    """Return the document-information dictionary of a PDF file.

    :param pdf_file: path of the PDF to read
    :return: the reader's ``documentInfo`` value
    """
    with open(pdf_file, 'rb') as fin:
        # Fetch the info while the stream is still open: the reader loads it
        # from the file on access, which fails once the file is closed.  The
        # writer and page copy that used to be built here were never used.
        return PdfFileReader(fin).documentInfo
Esempio n. 13
0
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build the ``BookMeta`` record for a PDF file.

    When ``use_pdf_meta`` is set, the document-information dictionary and the
    XMP metadata are read from the file.  XMP values take precedence; fields
    that are still empty are filled from the document information; what
    remains empty falls back to the file name (title) and ``u'Unknown'``
    (author).

    :param tmp_file_path: path of the PDF on disk
    :param original_file_name: name used as the title of last resort
    :param original_file_extension: stored as the record's extension
    :return: a populated ``BookMeta``
    """
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']
    else:
        # Start from an empty author (not u'Unknown') so that the
        # document-information author below gets a chance to fill it in.
        author = ''
        title = ''
        languages = [""]
        publisher = ""
        subject = ""
        tags = ""

    if doc_info:
        if author == '':
            author = ' & '.join(split_authors([doc_info.author])) if doc_info.author else u'Unknown'
        if title == '':
            title = doc_info.title if doc_info.title else original_file_name
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            if isinstance(doc_info['/Keywords'], bytes):
                tags = doc_info['/Keywords'].decode('utf-8')
            else:
                tags = doc_info['/Keywords']
    else:
        # No document information: keep a title found in XMP and only fall
        # back to the file name when there is none.
        if title == '':
            title = original_file_name
        if author == '':
            author = u'Unknown'

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])
Esempio n. 14
0
def merge_pdfs(paths, output):
    """Concatenate the pages of several PDFs, in order, into one file.

    :param paths: iterable of PDF sources handed to ``PdfFileReader``
    :param output: path of the merged PDF to write
    """
    merged = PdfFileWriter()

    for source in paths:
        reader = PdfFileReader(source)
        page_total = reader.getNumPages()
        # Append this document's pages behind the ones collected so far
        for index in range(page_total):
            merged.addPage(reader.getPage(index))

    with open(output, 'wb') as sink:
        merged.write(sink)
Esempio n. 15
0
    def pypdf3(self):
        """Rotate every page of ``self.file_name`` clockwise by
        ``self.rotation`` and write the result to ``self.outfn``.

        :return: ``self.outfn``
        """
        with open(self.file_name, 'rb') as source:
            rotated = PdfFileWriter()
            reader = PdfFileReader(source)
            page_total = reader.numPages
            for index in range(page_total):
                current = reader.getPage(index)
                current.rotateClockwise(self.rotation)
                rotated.addPage(current)

            # Written while the source is still open
            with open(self.outfn, 'wb') as sink:
                rotated.write(sink)
        return self.outfn
Esempio n. 16
0
def has_PDFA_XMP(file):
    """Tell whether a PDF's XMP metadata declares an accepted PDF/A version.

    :param file: path of the PDF to inspect
    :return: ``True`` when the XMP metadata exists and the PDF/A version found
        in it is listed in ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``;
        ``False`` otherwise, including when the file cannot be read or parsed
    """
    try:
        with open(file, mode="rb") as fileData:
            xmpfile = PdfFileReader(fileData, strict=False)
            metadata = xmpfile.getXmpMetadata()
            if metadata is not None:
                pdfa = app.config["PDFA"]
                nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                    return True
            return False
    except Exception:
        # Any read/parse failure means "not verifiable".  Unlike a bare
        # ``except``, this lets KeyboardInterrupt and SystemExit propagate.
        return False
Esempio n. 17
0
def pypdf3_reader(pdf, decrypt=None):
    """
    Build a PdfFileReader for a document, decrypting it when a password is given.

    :param pdf: PDF document to read
    :param decrypt: Password used to decrypt the pdf; skipped when falsy
    :return: PdfFileReader object
    """
    reader = PdfFileReader(pdf)
    if decrypt:
        reader.decrypt(decrypt)
    return reader
    def browse_PDF(self):
        """Let the user pick a PDF and enable conversion when it is usable.

        The dialog result is stored in ``self.filename``.  A readable,
        unencrypted file enables the convert button; an encrypted or
        unreadable file is reported in a message box.  Cancelling the dialog
        does nothing.
        """
        self.filename = QtWidgets.QFileDialog.getOpenFileName(None,'Select PDF File','/','PDF File (*.pdf)')
        selected = self.filename[0]
        if not selected:
            # Dialog was cancelled: nothing to check
            return

        try:
            encrypted = PdfFileReader(selected).getIsEncrypted()
        except Exception:
            # A file the reader cannot parse is reported like an encrypted one
            encrypted = True

        if not encrypted:
            self.convertPDF.setDisabled(False)
            return

        # The previous condition was inverted, so this dialog could never
        # appear.
        msg = QMessageBox()
        msg.setWindowTitle('Error')
        msg.setText('File curropted or encrypted.')
        msg.setIcon(3)
        msg.exec_()
Esempio n. 19
0
    def _extract(self, report_path) -> pd.DataFrame:
        """Extract the age-group table of a PDF report into a DataFrame.

        The report date is parsed from the first page.  The text of the table
        page is tokenised from the first ``"0-9"`` onwards and read as 11 rows
        of ``len(INPUT_COLUMNS)`` tokens each.

        :param report_path: path of the PDF report
        :return: DataFrame with a ``date`` column followed by ``INPUT_COLUMNS``
        :raises TableExtractionError: when the tokens of a row cannot be
            converted
        """
        num_rows = 11
        num_columns = len(INPUT_COLUMNS)

        pdf = PdfFileReader(str(report_path))
        date = extract_datetime(extract_text(pdf, page=0))
        page, _ = find_table_page(pdf)
        page = self.unknown_age_matcher.sub("unknown", page)
        data_start = page.find("0-9")
        # on 2020-09-28, they wrote floats like "1, 5"
        raw_data = page[data_start:].replace(", ", ",")
        tokens = raw_data.split()
        # In some cases, PyPDF3 doesn't read the token "≥90" (probably a bug),
        # so I insert that manually in case is missing. Couldn't the token in
        # that position be "90" by coincidence? Nope. If "≥90" is missing, the
        # token in that position is the cumulative total of cases with age >= 90
        # which has never been equal to 90 (and never will be).
        if tokens[9 * num_columns] not in {"90", ">90", "≥90"}:
            tokens.insert(9 * num_columns, ">=90")
        rows = []
        for i in range(num_rows):
            start = i * num_columns
            end = start + num_columns
            row_tokens = tokens[start:end]
            try:
                values = convert_values(row_tokens, COLUMN_CONVERTERS)
            # A tuple is required here: ``except ValueError or TypeError``
            # evaluates to ``except ValueError`` and lets TypeError through.
            except (ValueError, TypeError) as err:
                logger.debug('Error in row %d: %s', i, err)
                raise TableExtractionError(
                    f"\nError while converting values of row {i}: {err}.\n"
                    f"Row tokens: {' | '.join(row_tokens)}") from err
            row = [date, *values]
            rows.append(row)
        report_data = pd.DataFrame(rows, columns=["date", *INPUT_COLUMNS])
        return report_data
Esempio n. 20
0
 def extract(self, report_path) -> pd.DataFrame:
     """Extract the table of a PDF report and return it with derived columns.

     The report date comes from the first page.  The table page is tokenised
     on single spaces from the first '0-9' onwards and read as 11 rows of
     ``len(INPUT_COLUMNS)`` tokens.  The table is then normalised, the derived
     columns are computed, and the recomputed columns are checked against the
     extracted ones.
     """
     document = PdfFileReader(str(report_path))
     report_date = extract_datetime(extract_text(document, page=0))
     table_text, _ = find_table_page(document)
     table_text = self.unknown_age_matcher.sub('unknown', table_text)
     # from 28/09, they write "1,5" as "1, 5"
     table_body = table_text[table_text.find('0-9'):].replace(', ', ',')
     tokens = table_body.split(' ')
     row_count = 11
     column_count = len(INPUT_COLUMNS)
     rows = [
         [
             report_date,
             *convert_values(
                 tokens[index * column_count:(index + 1) * column_count],
                 COLUMN_CONVERTERS),
         ]
         for index in range(row_count)
     ]
     extracted = normalize_table(
         pd.DataFrame(rows, columns=['date', *INPUT_COLUMNS]))
     recomputed = compute_derived_columns(extracted)
     # sanity check
     check_recomputed_columns_match_extracted_ones(
         extracted=extracted, recomputed=recomputed)
     return recomputed
Esempio n. 21
0
def pdf_parser(s):
    """Extract the document-information entries of a PDF held in memory.

    :param s: raw bytes of the PDF
    :return: dict of the info entries with the first character of each key
        removed; empty when the file has no info or when decrypting it is
        not implemented
    """
    s = s.strip()
    # The reader is given ``fp`` as its warning destination to suppress
    # warning messages, so the sink has to stay open for as long as the
    # reader is in use, not only while it is constructed.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo() or {}
        return {key[1:]: meta.get(key) for key in meta.keys()}
def split_pdf(myfile):
    """Extract sample pages from a PDF in ``/tmp`` and upload them.

    For every complete group of 50 pages, the first page of the group (and
    the second, when it exists) is added to the output, which is written to
    ``/tmp/document-page<i>.pdf`` and uploaded to ``destbucketName`` under
    ``extracted-pdf/``.

    :param myfile: name of the PDF inside ``/tmp``
    """
    # ``with`` closes the input even when extraction or upload fails
    with open('/tmp/' + myfile, 'rb') as pdf_in_file:
        inputpdf = PdfFileReader(pdf_in_file)
        pages_no = inputpdf.numPages
        print(pages_no)
        # NOTE(review): the writer is shared by all iterations, so every
        # output file also contains the pages of the previous groups -
        # confirm whether that is intended.
        output = PdfFileWriter()
        for i in range(pages_no // 50):
            output.addPage(inputpdf.getPage(i * 50))
            if i * 50 + 1 < pages_no:
                output.addPage(inputpdf.getPage(i * 50 + 1))
                print('/tmp/document-page%s.pdf' % i)
            newname = 'document-page%s.pdf' % i
            print(newname)
            with open("/tmp/document-page%s.pdf" % i, "wb") as outputStream:
                output.write(outputStream)
            # Upload only after the file is closed; uploading inside the
            # ``with`` could send a file whose buffer was not flushed yet.
            client.upload_file('/tmp/' + newname, destbucketName,
                               'extracted-pdf/' + newname)
Esempio n. 23
0
def start():
    """Interactively print the document information of a PDF in ``pdfs/``.

    Lists the files in ``pdfs/`` (skipping names containing ``emptyfile``),
    asks for a file name, appends ``.pdf`` when missing, and prints every
    document-information entry of that file.
    """
    from PyPDF3 import PdfFileReader
    import glob
    import os
    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for d in glob.iglob("pdfs/*"):
        if "emptyfile" not in d:
            # str.replace needs the replacement text; the one-argument call
            # raised TypeError.
            print(d.replace("pdfs/", ""))
    ans = str(input("> "))
    if ".pdf" not in ans:
        ans = ans + ".pdf"
    # The listing shows bare names, so look in pdfs/ unless the answer is
    # already a valid path.
    path = ans if os.path.exists(ans) else os.path.join("pdfs", ans)
    # The reader needs an open binary stream, not an (name, mode) tuple
    with open(path, 'rb') as stream:
        docInfo = PdfFileReader(stream).getDocumentInfo() or {}
        for metaItem in docInfo:
            # Values are not always plain strings, so convert before joining
            print("- " + metaItem + ":" + str(docInfo[metaItem]))
    print("\n")
Esempio n. 24
0
def reorder(input_filename: str, output_filename: str) -> None:
    """Write a copy of a PDF with its pages in the order given by
    ``_make_sequence``.

    :param input_filename: path of the existing PDF to read
    :param output_filename: path of the PDF to create; must not exist yet
    :raises AssertionError: when the input is missing or the output exists
    """
    assert os.path.exists(input_filename)
    assert os.path.exists(output_filename) is False

    # Context managers close both streams even when reading or writing fails
    with open(input_filename, 'rb') as input_stream:
        output = PdfFileWriter()
        input_pdf = PdfFileReader(input_stream)

        pages = input_pdf.getNumPages()
        order = _make_sequence(pages)

        for page_number in order:
            output.addPage(input_pdf.getPage(page_number))

        # Written while the input is still open
        with open(output_filename, "wb") as output_stream:
            output.write(output_stream)
Esempio n. 25
0
    def pypdf3(self):
        """Place every page of ``self.file_name``, scaled by ``self.scale``
        and offset by the margins, on a blank page of the target size and
        write the result to ``self.output``.

        :return: ``self.output``
        """
        source = PdfFileReader(self.file_name)
        result = PdfFileWriter()

        for index in range(source.getNumPages()):
            original_page = source.getPage(index)

            # Blank page of the target size that receives the scaled original
            canvas = PageObject.createBlankPage(width=self.target_w,
                                                height=self.target_h)
            canvas.mergeScaledTranslatedPage(original_page, self.scale,
                                             self.margin_x, self.margin_y)
            result.addPage(canvas)

        with open(self.output, "wb") as sink:
            result.write(sink)
        return self.output
Esempio n. 26
0
def main():
    """Combine consecutive pages of a PDF side by side ("2-up").

    Reads the input and output paths from ``sys.argv``.  Each pair of pages
    is merged into one page, with the second page placed to the right of the
    first; a trailing unpaired page is copied as is.  Exits with status 1
    when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)
    print("2-up input " + sys.argv[1])
    with open(sys.argv[1], "rb") as source:
        input1 = PdfFileReader(source)
        output = PdfFileWriter()
        page_count = input1.getNumPages()
        for index in range(0, page_count - 1, 2):
            lhs = input1.getPage(index)
            rhs = input1.getPage(index + 1)
            lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
            output.addPage(lhs)
            # Progress on one line (the Python 2 ``print ... ,`` idiom)
            print(str(index) + " ", end="")
            sys.stdout.flush()

        if page_count % 2:
            # Odd page count: keep the last page instead of dropping it
            output.addPage(input1.getPage(page_count - 1))

        print("writing " + sys.argv[2])
        # ``file()`` no longer exists in Python 3; ``open`` in a ``with``
        # also makes sure the output is flushed and closed.
        with open(sys.argv[2], "wb") as outputStream:
            output.write(outputStream)
    print("done.")
Esempio n. 27
0
def write_pdf(pdf_obj, destination):
    """
    Copy every page of a PDF object into a new file
    :param pdf_obj: PDF object handed to ``PdfFileReader``
    :param destination: Destination path
    """
    source = PdfFileReader(pdf_obj)
    copy = PdfFileWriter()

    # Transfer the pages one by one, keeping their order
    total = source.getNumPages()
    index = 0
    while index < total:
        copy.addPage(source.getPage(index))
        index += 1

    # Persist the collected pages
    with open(destination, "wb") as sink:
        copy.write(sink)
Esempio n. 28
0
    def prepare(self):
        """Dump the text of ``self.inputPath`` to a text file, strip its line
        breaks, and pass the file on to ``self.transform``.

        The text of every page is appended to the file named by
        ``join_paths("./.TXT", hash(self.inputPath))``; afterwards the file
        is rewritten with all ``\\n`` and ``\\r`` characters removed.
        """
        # Computed once, before the loop: the name does not depend on the
        # page, and binding it only inside the loop left it undefined for a
        # PDF without pages.
        # NOTE(review): if ``hash`` is the builtin, the value is an int and
        # differs between interpreter runs for strings - confirm it is a
        # project helper.
        filename = join_paths("./.TXT", hash(self.inputPath))

        # Process PDF input file to raw text file
        with open(self.inputPath, "rb") as pdf_stream:
            reader = PdfFileReader(pdf_stream)
            # Open the text file once instead of once per page; a distinct
            # name avoids shadowing the PDF handle.
            with open(filename, "a") as text_out:
                for page in tqdm(range(0, reader.numPages)):
                    page_text = reader.getPage(page).extractText()
                    print("Reading page", page, "of", reader.getNumPages())
                    text_out.write(page_text)

        # Cleaning the TEXT file for better processing
        with open(filename, "r") as text_in:
            lines = [line.replace("\n", "").replace("\r", "") for line in text_in]
        with open(filename, "w") as text_out:
            text_out.writelines(lines)
            print("Cleaning... => ", filename)
        self.transform(filename)
Esempio n. 29
0
def add_encryption(path, encryptPath, fileDicts):
    """Write a password-protected copy of each listed PDF.

    :param path: directory holding the source PDFs
    :param encryptPath: directory receiving the encrypted copies
    :param fileDicts: mapping of file name to the user password for that file
    """
    for fileName in fileDicts:
        input_pdf = os.path.join(path, fileName)
        output_pdf = os.path.join(encryptPath, fileName)
        pdf_reader = PdfFileReader(input_pdf)

        # One writer per file: a shared writer made every output contain the
        # pages of all previously processed files and be encrypted again.
        pdf_writer = PdfFileWriter()
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        pdf_writer.encrypt(user_pwd=fileDicts[fileName],
                           owner_pwd=None,
                           use_128bit=True)

        # Delete the output file when it already exists
        if os.path.exists(output_pdf):
            os.remove(output_pdf)

        with open(output_pdf, 'wb') as fh:
            pdf_writer.write(fh)
Esempio n. 30
0
def get_reader(filename, password):
    """Open a PDF and return a reader that is ready to use, or ``None``.

    :param filename: path of the PDF to open
    :param password: password for encrypted files, or ``None``
    :return: a ``PdfFileReader``, or ``None`` when the file cannot be opened,
        needs a password that was not given, or the password is wrong
    """
    try:
        old_file = open(filename, 'rb')
    except Exception as err:
        print('文件打开失败!' + str(err))
        return None

    # Create the reader; do not leak the handle if parsing fails
    try:
        pdf_reader = PdfFileReader(old_file, strict=False)
    except Exception:
        old_file.close()
        raise

    # Decrypt when necessary
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            old_file.close()
            return None
        # decrypt() returns 0 on failure, 1 for a user-password match and 2
        # for an owner-password match, so only 0 means "wrong password".
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None
    # The stream stays open on success because the returned reader keeps
    # reading from it.  The former ``old_file in locals()`` test compared a
    # file object with variable names and was never true.
    return pdf_reader