Beispiel #1
0
def splitPdfOnePageEach(file, wayToSaveFiles, sequential=0):
    """Split a PDF into one single-page PDF file per page.

    The pages are written to ``<wayToSaveFiles>/pdfs/<name>-<sequential>/``
    as ``1.pdf``, ``2.pdf``, ... where ``<name>`` is obtained by passing the
    base name of ``file`` to ``funcoesUteis.getOnlyNameFile``.

    Args:
        file: Path of the PDF to split.
        wayToSaveFiles: Base directory under which the ``pdfs`` folder is
            created.
        sequential: Suffix appended to the output directory name.

    Returns:
        None. This is a best-effort operation: any failure is reported on
        stdout instead of being raised.
    """
    try:
        nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
        wayBaseToSaveFile = os.path.join(wayToSaveFiles, 'pdfs',
                                         f"{nameFile}-{sequential}")
        # exist_ok=True: without it a second run for the same file raised
        # FileExistsError, which was swallowed and silently produced nothing.
        os.makedirs(wayBaseToSaveFile, exist_ok=True)

        with open(file, 'rb') as filePdf:
            pdfReader = PyPDF2.PdfFileReader(filePdf)

            for numberPage in range(pdfReader.getNumPages()):
                # A fresh writer per page so every output holds one page.
                pdfWriter = PyPDF2.PdfFileWriter()
                pdfWriter.addPage(pdfReader.getPage(numberPage))

                # os.path.join instead of a hard-coded backslash, so the
                # page lands inside the directory on every platform.
                pagePath = os.path.join(wayBaseToSaveFile,
                                        f'{numberPage + 1}.pdf')
                with open(pagePath, 'wb') as newPdfPerPage:
                    pdfWriter.write(newPdfPerPage)
    except Exception as e:
        # Keep the best-effort contract (never raise), but make the failure
        # visible instead of discarding it.
        print(f'\t - Não foi possível processar o arquivo {file}. O erro é: {e}')
def ImageToText(file, wayToSaveFile):
    """Run OCR on an image and save the recognised text as a ``.txt`` file.

    The text is written, UTF-8 encoded, to ``<wayToSaveFile>/<name>.txt``
    where ``<name>`` is obtained by passing the base name of ``file`` to
    ``funcoesUteis.getOnlyNameFile``.

    Args:
        file: Path of the image to read.
        wayToSaveFile: Directory in which the text file is created.
    """
    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"

    # OCR runs before the output file is opened, so a failing OCR no longer
    # leaves an empty .txt behind or leaks an open file handle.
    # NOTE(review): assumes `Image` is PIL's Image module, whose images can
    # be used as context managers — confirm against the file's imports.
    with Image.open(file) as image:
        content = ocr.image_to_string(image, lang='por')

    with open(wayToSave, "w", encoding='utf-8') as textFile:
        textFile.write(content)
Beispiel #3
0
def PDFImgToText(file, wayToSaveFile):
    """Rasterise a PDF to a JPEG with ImageMagick and OCR the result.

    The image is written to ``<wayToSaveFile>/<name>.jpg`` and then handed
    to ``ImageToText``, which saves the recognised text in the same
    directory.

    Args:
        file: Path of the PDF to convert.
        wayToSaveFile: Directory that receives the image and the text file.
    """
    import subprocess

    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.jpg"

    # Pass an argument list instead of a shell string: paths containing
    # quotes, spaces or shell metacharacters are then handed to `magick`
    # verbatim and cannot break or inject into the command line.
    # The exit status is intentionally not checked (as before); a failed
    # conversion surfaces when ImageToText cannot open the image.
    # NOTE(review): for multi-page PDFs ImageMagick presumably writes
    # <name>-0.jpg, <name>-1.jpg, ... rather than <name>.jpg — confirm.
    subprocess.run(['magick', '-density', '300', file, wayToSave])

    ImageToText(wayToSave, wayToSaveFile)
def PDFToText(file, wayToSaveFile, mode="simple"):
    """Convert a PDF to a text file, falling back to OCR when it has no text.

    The PDF text is first extracted with ``slate``. If, after
    ``funcoesUteis.treatTextField``, that text is empty, the PDF is
    converted through ``PDFImgToText`` (image + OCR). Otherwise the bundled
    ``pdftotext64.exe`` is run to write ``<wayToSaveFile>/<name>.txt``.

    Args:
        file: Path of the PDF to convert.
        wayToSaveFile: Directory that receives the output file.
        mode: Option passed to pdftotext, prefixed with ``-``
            (``"simple"`` becomes ``-simple``).

    Returns:
        None. Any failure is reported on stdout instead of being raised.
    """
    import subprocess

    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"
    try:
        with open(file, 'rb') as filePdf:
            # slate.PDF is iterated to obtain the text pieces of the document.
            textPdf = "".join(slate.PDF(filePdf))

        if funcoesUteis.treatTextField(textPdf) == "":
            # No extractable text: go through image conversion + OCR.
            PDFImgToText(file, wayToSaveFile)
        else:
            # Argument list instead of a shell string: the executable path
            # and file names may now contain spaces or quotes safely.
            # The option is whitespace-split so a mode carrying several
            # flags is still passed as separate arguments, as the shell did.
            executable = f'{fileDir}/exe/pdftotext64.exe'
            subprocess.run([executable, *f'-{mode}'.split(), file, wayToSave])

    except Exception as ex:
        print(f"Nao foi possivel transformar o arquivo \"{file}\". O erro é: {str(ex)}")