Beispiel #1
0
def pdf_split(filename, output_directory):
    """Split a PDF into one single-page PDF file per page.

    Each page is written into ``output_directory`` as ``<stem>-<page>.pdf``,
    where ``<stem>`` is the base name of ``filename`` without its extension
    and ``<page>`` is the 1-based page number.

    Args:
        filename: Path of the PDF file to split.
        output_directory: Directory that receives the per-page files. It is
            not created by this function.

    Returns:
        List with the paths of the written page files, in page order.
    """
    # Strip the actual extension instead of a fixed four characters, so names
    # without a ".pdf"-length suffix are not truncated arbitrarily.
    stem = os.path.splitext(os.path.basename(filename))[0]

    # list of pdf page filenames
    filenames = []

    # The source stays open for the whole loop because pages are fetched from
    # the reader one at a time; the `with` block guarantees it is closed
    # afterwards, even if writing a page fails.
    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)

        for i in range(pdf.getNumPages()):
            output_filename = os.path.join(
                output_directory, stem + '-%d.pdf' % (i + 1))

            # one writer per page, holding just that page
            output = PdfFileWriter()
            output.addPage(pdf.getPage(i))
            with open(output_filename, "wb") as output_stream:
                output.write(output_stream)

            filenames.append(output_filename)

    return filenames
Beispiel #2
0
pdf_filename = os.path.join(output_folder, "file.pdf")

# Render the PDF to JPEG screenshots with ImageMagick's `convert`.
# An earlier variant of this call used the explicit Windows path
# "%PROGRAMFILES%\ImageMagick-6.7.0-Q16\convert.exe"; the plain `convert`
# below relies on the shell locating the executable.
# NOTE(review): the command line is built by string interpolation, so a path
# containing a double quote would break the quoting.
# The parenthesised single-argument print behaves the same as a Python 2
# print statement and is also valid Python 3.
print("generating screenshots...")
os.system(
    "convert \"%s\" -alpha off -resize 500x500 -quality 70 \"%s\"" % (
        pdf_filename,
        # Built with os.path.join (like pdf_filename) instead of a
        # hard-coded "\\" separator.
        os.path.join(output_folder, "scr.jpg"),
        )
    )
print("screenshots generated")

# Path of the PDF without its extension. Per-page names are derived from it
# rather than via str.replace('.pdf', ...), which would also rewrite any
# ".pdf" occurring earlier in the path (e.g. in a directory name).
pdf_base = os.path.splitext(pdf_filename)[0]

# open pdf file; the `with` block closes it once all pages are processed
with open(pdf_filename, "rb") as pdf_file:
    pdf = PdfFileReader(pdf_file)

    # iterate through all pages (file names use the 0-based page index)
    for i in range(0, pdf.getNumPages()):
        page_base = pdf_base + '-%d' % i

        # split pages and save files
        outputFileName = page_base + '.pdf'
        output = PdfFileWriter()
        output.addPage(pdf.getPage(i))
        with open(outputFileName, "wb") as outputStream:
            output.write(outputStream)

        # convert to text and save into a file
        # NOTE(review): presumably convert_pdf writes the matching ".txt"
        # file that translate_txt then reads -- confirm against its definition.
        convert_pdf(outputFileName)
        translate_txt(page_base + '.txt', 'libs/chars_ayna.xml')