def pdf_split(filename, output_directory):
    """Split a PDF into one single-page PDF file per page.

    Every page of ``filename`` is written to ``output_directory`` as
    ``<stem>-<n>.pdf``, where ``<stem>`` is the base name of ``filename``
    without its extension and ``<n>`` is the 1-based page number.

    Args:
        filename: Path of the PDF file to split.
        output_directory: Directory that receives the page files. It is not
            created here.

    Returns:
        A list with the paths of the written page files, in page order.
    """
    # Strip the extension with splitext rather than a fixed [:-4] slice so
    # the stem is correct whatever the length of the extension.
    stem = os.path.splitext(os.path.basename(filename))[0]

    filenames = []
    # Keep the source open for the whole loop (pages are fetched from the
    # reader one at a time) and guarantee it is closed afterwards, even if
    # a page fails to write.
    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        for i in range(pdf.getNumPages()):
            page_path = os.path.join(
                output_directory, '%s-%d.pdf' % (stem, i + 1))

            output = PdfFileWriter()
            output.addPage(pdf.getPage(i))
            with open(page_path, "wb") as output_stream:
                output.write(output_stream)

            filenames.append(page_path)

    return filenames
# Step 1: render JPEG previews of the PDF by shelling out to `convert`
# (presumably ImageMagick's, judging by the alternative path kept below).
print("generating screenshots...")
# NOTE(review): the command line is built by string interpolation and run
# through the shell; a path containing a double quote would break it.
# The output path uses a literal backslash separator.
os.system(
    # Alternative with an explicit install location:
    # "%PROGRAMFILES%\\ImageMagick-6.7.0-Q16\\convert.exe \"%s\" -alpha off -resize 500x500 -quality 70 \"%s\\scr.jpg\"" % (
    "convert \"%s\" -alpha off -resize 500x500 -quality 70 \"%s\\scr.jpg\"" % (
        pdf_filename,
        output_folder
    )
)
print("screenshots generated")

# Step 2: split the PDF into single-page files next to the source, convert
# each page to text and run the character translation on the text file.
#
# The page names are derived with splitext instead of
# str.replace('.pdf', ...): replace() leaves the name unchanged when the
# path has no lowercase '.pdf' (which would make the page file overwrite
# the source PDF) and rewrites every '.pdf' occurrence in the path.
page_stem = os.path.splitext(pdf_filename)[0]

# Keep the source open while pages are read; the context manager closes it
# even if a page fails to process.
with open(pdf_filename, "rb") as pdf_file:
    pdf = PdfFileReader(pdf_file)

    for i in range(pdf.getNumPages()):
        # Page files are numbered from 0 here.
        page_path = '%s-%d.pdf' % (page_stem, i)

        output = PdfFileWriter()
        output.addPage(pdf.getPage(i))
        with open(page_path, "wb") as output_stream:
            output.write(output_stream)

        # Convert the page to text, then translate the resulting .txt file
        # (assumes convert_pdf writes '<page>.txt' beside the page PDF --
        # TODO confirm against convert_pdf).
        convert_pdf(page_path)
        translate_txt('%s-%d.txt' % (page_stem, i), 'libs/chars_ayna.xml')

        # NOTE: the page PDF is left on disk; its removal is disabled.
        # os.remove(page_path)