def _get_images_from_pdf(pdf_filename, resolution, verbose, delete_files,
                         temp_dir, make_thumbs, thumb_size, thumb_dir,
                         thumb_prefix, pool_count=1):
    """Split a PDF into one-page PDFs and dispatch workers to convert them.

    Every page of ``pdf_filename`` is written to
    ``<temp_dir>/document-page-<index>.pdf`` and its index is put on the
    module-level ``__pdf_queue``.  ``pool_count`` tasks running
    ``_pdf_converter_worker`` are then submitted to a process pool, and the
    call polls ``__pdf_texts`` until it holds one entry per page.

    ``temp_dir`` (and ``thumb_dir`` when ``make_thumbs`` is set) are created
    if missing.  The remaining arguments are passed through to the workers
    untouched.

    Any exception is caught and printed rather than raised; the function
    always returns None.
    """
    try:
        if verbose:
            print("Splitting PDF into multiple pdf's for processing ...")

        # make sure there is a place to put our temporary pdfs
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)

        # make sure the thumbnail folder exists if we are going to make thumbs
        if make_thumbs and not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)

        # The reader pulls pages lazily from the stream, so the input file
        # has to stay open until every page has been written out.
        with open(pdf_filename, "rb") as pdf_stream:
            inputpdf = PdfFileReader(pdf_stream)
            if inputpdf.getIsEncrypted():
                inputpdf.decrypt('')

            # Remember the page count: it is needed after the stream closes.
            num_pages = inputpdf.numPages
            if verbose:
                print("Writing out %i pages ..." % num_pages)

            # create all of the temporary pdfs
            for i in range(num_pages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                filename = "{0}/document-page-{1}.pdf".format(temp_dir, i)
                with open(filename, "wb") as outputStream:
                    output.write(outputStream)
                __pdf_queue.put(i)

        if verbose:
            print("Dispatching pdf workers ...")

        # spin up our workers to convert the pdfs to images
        pool = Pool()
        pool.map_async(
            _pdf_converter_worker,
            [(x, resolution, verbose, delete_files, temp_dir, make_thumbs,
              thumb_size, thumb_dir, thumb_prefix)
             for x in range(pool_count)]
        )

        # Workers report finished pages through __pdf_texts; wait for all.
        while __pdf_texts.qsize() != num_pages:
            time.sleep(.25)

        if verbose:
            print("Done converting PDF.")
    except Exception as e:
        print(str(e))
def OCR(self, fn, resolution=300, verbose=False, part=''):
    """Run OCR over every page of the PDF at path ``fn``.

    Each page is written to a temporary one-page PDF, rendered to a JPEG
    with PythonMagick at ``resolution`` DPI, and passed to
    ``image_to_string``.  The temporary files are named
    ``tmp-<part>-<page>.pdf`` / ``.jpg`` in the current directory, so
    ``part`` lets concurrent callers avoid clashing names.

    Returns an ``OCRResult`` built from the concatenated raw text, its
    cleaned form (``self.OCRCleanup``), the page count and a list of
    per-page ``OCRPage`` objects.  Returns ``False`` if the PDF is
    encrypted and cannot be decrypted with an empty password.
    """
    part = str(part)
    pagedata = []
    page_texts = []

    # The reader fetches pages lazily, so keep the input open while looping.
    with open(fn, 'rb') as pdf_stream:
        pdf = PdfFileReader(pdf_stream)
        if pdf.getIsEncrypted() and not pdf.decrypt(''):
            return False

        for i, p in enumerate(pdf.pages, 1):
            if verbose:
                print(' --- ' + str(i))

            # Temporary filenames for ImageMagick conversion
            pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
            pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

            try:
                # Parse this page
                output = PdfFileWriter()
                output.addPage(p)
                with open(pgfile, 'wb') as outputStream:
                    output.write(outputStream)

                # Convert this page to a high-resolution JPEG
                img = PythonMagick.Image()
                img.density(str(resolution))
                img.read(pgfile)
                img.write(pgfilejpg)

                # OCR the converted JPG, dropping any alpha channel first
                im = Image.open(pgfilejpg)
                bands = im.split()
                if len(bands) == 4:
                    im = Image.merge('RGB', bands[:3])
                t = image_to_string(im)
            finally:
                # Cleanup, even when conversion or OCR failed part-way
                for tmp_path in (pgfile, pgfilejpg):
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

            # Add to data object
            pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
            page_texts.append(t)

    # Produce the output data object
    text = ''.join(page_texts)
    return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
def OCR(self, fn, resolution=300, verbose=False, part=''):
    """Run OCR over every page of the PDF at path ``fn``.

    Each page is written to a temporary one-page PDF, rendered to a JPEG
    with PythonMagick at ``resolution`` DPI, and passed to
    ``image_to_string``.  The temporary files are named
    ``tmp-<part>-<page>.pdf`` / ``.jpg`` in the current directory, so
    ``part`` lets concurrent callers avoid clashing names.

    Returns an ``OCRResult`` built from the concatenated raw text, its
    cleaned form (``self.OCRCleanup``), the page count and a list of
    per-page ``OCRPage`` objects.  Returns ``False`` if the PDF is
    encrypted and cannot be decrypted with an empty password.
    """
    part = str(part)
    pagedata = []
    page_texts = []

    # The reader fetches pages lazily, so keep the input open while looping.
    with open(fn, 'rb') as pdf_stream:
        pdf = PdfFileReader(pdf_stream)
        if pdf.getIsEncrypted() and not pdf.decrypt(''):
            return False

        for i, p in enumerate(pdf.pages, 1):
            if verbose:
                print(' --- ' + str(i))

            # Temporary filenames for ImageMagick conversion
            pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
            pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

            try:
                # Parse this page
                output = PdfFileWriter()
                output.addPage(p)
                with open(pgfile, 'wb') as outputStream:
                    output.write(outputStream)

                # Convert this page to a high-resolution JPEG
                img = PythonMagick.Image()
                img.density(str(resolution))
                img.read(pgfile)
                img.write(pgfilejpg)

                # OCR the converted JPG, dropping any alpha channel first
                im = Image.open(pgfilejpg)
                bands = im.split()
                if len(bands) == 4:
                    im = Image.merge('RGB', bands[:3])
                t = image_to_string(im)
            finally:
                # Cleanup, even when conversion or OCR failed part-way
                for tmp_path in (pgfile, pgfilejpg):
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

            # Add to data object
            pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
            page_texts.append(t)

    # Produce the output data object
    text = ''.join(page_texts)
    return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
class PdfBox(object):
    """Wrap a pyPdf ``PdfFileReader`` in a small PDF object.

    On construction the file is opened, decrypted if necessary, and its
    document-info fields (author, title, subject) and page count are copied
    onto the instance.  Page text is extracted on demand by ``getPage`` and
    cached per page number.

    The underlying file stays open for the life of the object because the
    reader fetches pages lazily from it.
    """

    # Class-level defaults; __init__ overrides them per instance.
    pdfReader = None
    pdfInfo = None
    currentpage = 0
    extractedPages = {}
    filepath = ""
    isencrypted = False
    password = ""
    author = ""
    title = ""
    subject = ""
    pages = 0
    initialized = False

    def __init__(self, filepath, password=None):
        """Open ``filepath`` and load its metadata.

        ``password`` is used to decrypt the document when it is encrypted;
        when omitted, an empty password is tried.
        """
        # Give every instance its own cache instead of the shared class dict.
        self.extractedPages = {}
        self.filepath = filepath
        self.pdfReader = PdfFileReader(open(filepath, "rb"))
        if password:
            self.password = password
        if self.initializePdf(self.password):
            self.pages = self.pdfReader.getNumPages()
            self.pdfInfo = self.pdfReader.getDocumentInfo()
            # A PDF without an /Info dictionary yields None; keep defaults.
            if self.pdfInfo is not None:
                self.author = self.pdfInfo.author
                self.title = self.pdfInfo.title
                self.subject = self.pdfInfo.subject

    def initializePdf(self, password=None):
        """Decrypt the document if needed and mark the object initialized.

        ``password`` is the password to try; when None, the password stored
        on the instance is used.  Returns True when the document is readable
        (unencrypted or decrypted successfully), False when decryption
        failed.
        """
        if not self.pdfReader.getIsEncrypted():
            self.initialized = True
            return True

        self.isencrypted = True
        candidate = self.password if password is None else password
        if self.pdfReader.decrypt(candidate):
            # Remember the password that actually worked.
            self.password = candidate
            self.initialized = True
            return True
        return False

    def getPage(self, pagenum):
        """Return the extracted text of page ``pagenum`` (zero-based).

        The text is extracted once and cached; ``currentpage`` is updated
        on every call.
        """
        self.currentpage = pagenum
        if pagenum in self.extractedPages:
            return self.extractedPages[pagenum]
        text = self.pdfReader.getPage(pagenum).extractText()
        self.extractedPages[pagenum] = text
        return text
def export_to_file(self, file_out, only_selected=False):
    """Export the pages listed in ``self.model`` to the PDF ``file_out``.

    Every document in ``self.pdfqueue`` is opened from its ``copyname``.
    For each model row the referenced page is copied, rotated by the row's
    angle, cropped by the row's crop fractions (applied to the media box),
    and appended to the output.  With ``only_selected`` set, rows whose
    path is not in the icon view's current selection are skipped.

    Raises ``Exception`` when an input file is encrypted and cannot be
    decrypted with an empty password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    # Pages are read lazily, so the inputs must stay open until the output
    # has been written; they are all closed in the ``finally`` below.
    open_inputs = []
    try:
        for pdfdoc in self.pdfqueue:
            source = open(pdfdoc.copyname, 'rb')
            open_inputs.append(source)
            pdfdoc_inp = PdfFileReader(source)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    stat = 0
                if stat != 1:
                    errmsg = _(
                        'File %s is encrypted.\n'
                        'Support for encrypted files has not been implemented yet.\n'
                        'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                # FIXME: otherwise ask for password and decrypt file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]
            npage = row[3]
            current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
            angle = row[6]
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]
            if angle != 0:
                current_page.rotateClockwise(angle)
            if crop != [0., 0., 0., 0.]:
                rotate_times = int(round(((angle + angle0) % 360) / 90) % 4)
                crop_init = crop
                if rotate_times != 0:
                    # Re-map the crop sides onto the unrotated page: rotate
                    # the side order once per quarter turn, then restore the
                    # [left, right, top, bottom] layout.
                    perm = [0, 2, 1, 3]
                    for _turn in range(rotate_times):
                        perm.append(perm.pop(0))
                    perm.insert(1, perm.pop(2))
                    crop = [crop_init[perm[side]] for side in range(4)]
                (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                x1_new = int(x1 + (x2 - x1) * crop[0])
                x2_new = int(x2 - (x2 - x1) * crop[1])
                y1_new = int(y1 + (y2 - y1) * crop[3])
                y2_new = int(y2 - (y2 - y1) * crop[2])
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)

            pdf_output.addPage(current_page)

        # finally, write "output" to document-output.pdf
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for source in open_inputs:
            source.close()
def export_to_file(self, file_out, only_selected=False):
    """Export the pages listed in ``self.model`` to the PDF ``file_out``.

    Every document in ``self.pdfqueue`` is opened from its ``copyname``.
    For each model row the referenced page is copied, rotated by the row's
    angle, cropped by the row's crop fractions (applied to the media box),
    and appended to the output.  With ``only_selected`` set, rows whose
    path is not in the icon view's current selection are skipped.

    Raises ``Exception`` when an input file is encrypted and cannot be
    decrypted with an empty password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    # Pages are read lazily, so the inputs must stay open until the output
    # has been written; they are all closed in the ``finally`` below.
    open_inputs = []
    try:
        for pdfdoc in self.pdfqueue:
            source = open(pdfdoc.copyname, 'rb')
            open_inputs.append(source)
            pdfdoc_inp = PdfFileReader(source)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    stat = 0
                if stat != 1:
                    errmsg = _('File %s is encrypted.\n'
                               'Support for encrypted files has not been implemented yet.\n'
                               'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                # FIXME: otherwise ask for password and decrypt file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]
            npage = row[3]
            current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
            angle = row[6]
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]
            if angle != 0:
                current_page.rotateClockwise(angle)
            if crop != [0., 0., 0., 0.]:
                # Nearest quarter turn; floor division keeps this an int
                # regardless of the interpreter's "/" semantics.
                rotate_times = int(((angle + angle0) % 360 + 45) // 90) % 4
                crop_init = crop
                if rotate_times != 0:
                    # Re-map the crop sides onto the unrotated page: rotate
                    # the side order once per quarter turn, then restore the
                    # [left, right, top, bottom] layout.
                    perm = [0, 2, 1, 3]
                    for _turn in range(rotate_times):
                        perm.append(perm.pop(0))
                    perm.insert(1, perm.pop(2))
                    crop = [crop_init[perm[side]] for side in range(4)]
                (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                x1_new = int(x1 + (x2 - x1) * crop[0])
                x2_new = int(x2 - (x2 - x1) * crop[1])
                y1_new = int(y1 + (y2 - y1) * crop[3])
                y2_new = int(y2 - (y2 - y1) * crop[2])
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)

            pdf_output.addPage(current_page)

        # finally, write "output" to document-output.pdf
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for source in open_inputs:
            source.close()
def employer_resume_book_create(request):
    """Build a resume book file for the requesting recruiter and store it.

    When ``resume_book_id`` is posted, that existing book is re-delivered
    under its current name (404 if it does not exist).  Otherwise the
    recruiter's single undelivered book is fetched or created (surplus
    duplicates are deleted) and named ``<user>_<timestamp>``.

    With ``delivery_format == "separate"`` the visible students' resumes
    are zipped individually; otherwise one PDF is produced: a table of
    contents followed by every resume.  The resulting file is saved to
    the book's ``resume_book`` file field and an empty ``HttpResponse``
    is returned.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and discard the rest.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        resume_book_name = "%s_%s" % (str(request.user), now)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST["delivery_format"] == "separate":
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name)
        output = zipfile.ZipFile(file_name, "w")
        try:
            for student in resume_book.students.visible():
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                with open(resume_path, "rb") as resume_file:
                    name = "%s %s (%s, %s).pdf" % (
                        student.first_name,
                        student.last_name,
                        student.graduation_year,
                        student.degree_program,
                    )
                    output.write(resume_file.name, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        # Evaluate the ordered student list once; it drives both the table
        # of contents and the resume pages, which must agree.
        students = list(resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name"))
        # Rows start at 25.5cm and step down 0.5cm, so 51 fit on a page.
        rows_per_page = 51

        def flush_contents_page(canvas, page_buffer):
            """Finish the canvas and append its single page to the output."""
            canvas.showPage()
            canvas.save()
            rendered = cStringIO.StringIO(page_buffer.getvalue())
            output.addPage(PdfFileReader(rendered).getPage(0))

        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        now = datetime.now()
        first_line = "Created on %s at %s" % (
            now.strftime("%m/%d/%Y"), now.strftime("%I:%M %p"))
        c.drawString(1 * cm, 28.5 * cm, first_line)
        c.drawString(1 * cm, 28 * cm, str(request.user.recruiter))
        c.drawString(1 * cm, 27.5 * cm, str(request.user.recruiter.employer))
        c.drawString(16 * cm, 28.5 * cm, "Created using Umeqo")
        c.drawString(8.5 * cm, 26.5 * cm, "Resume Book Contents")

        pad_from_top = 0
        for student in students:
            row_y = (25.5 - pad_from_top * 0.5) * cm
            c.drawString(6.5 * cm, row_y,
                         "%s %s" % (student.first_name, student.last_name))
            c.drawString(12 * cm, row_y,
                         "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            # Start a fresh contents page every time the current one fills,
            # not only the first time.
            if pad_from_top == rows_per_page:
                flush_contents_page(c, report_buffer)
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        flush_contents_page(c, report_buffer)

        # Resume pages are read lazily when the output is written, so keep
        # every resume open until then and close them all afterwards.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as outputStream:
                output.write(outputStream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
def processFile(self, curr_file):
    """Extract metadata from a PDF and append a row to ``self.container``.

    Files whose name does not contain ``.pdf`` are ignored.  For a PDF the
    creation date, author, producer and modification date are read from
    the document-info dictionary (``'-'`` when absent) and a row
    ``[name, created, author, producer, modded, last_saved]`` is appended.
    Over-long names, producers and authors are shortened.

    This is best-effort: a PDF that cannot be read or whose dates cannot
    be parsed is skipped silently, and nothing is appended when the
    document has no info dictionary.  Always returns None.
    """
    def format_stamp(raw):
        """Convert a PDF date (``D:YYYYMMDDHHMM...``) to ``MM/DD/YYYY hh:mm AM``."""
        data = raw.strip("D:|'")
        clock = time.strftime(
            "%I:%M %p",
            time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

    if ".pdf" not in curr_file:
        return

    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'
    try:
        # Info values may be resolved lazily from the file, so every lookup
        # happens while the stream is still open.
        with open(curr_file, 'rb') as pdf_stream:
            pdfFile = PdfFileReader(pdf_stream)
            if pdfFile.getIsEncrypted():
                pdfFile.decrypt('')
            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return

            # looks at the entire dictionary to parse for information
            if "/CreationDate" in docInfo:
                created = format_stamp(docInfo["/CreationDate"])
            if "/Author" in docInfo:
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"
            if "/Producer" in docInfo:
                # Drop the literal "(Windows)" tag; str.strip() would treat
                # it as a set of characters and eat into the real name.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                if len(producer) == 0:
                    producer = "-"
                # Collapse runs of spaces left by the substitution above.
                producer = re.sub(r' {2,}', ' ', producer)
            if "/ModDate" in docInfo:
                modded = format_stamp(docInfo["/ModDate"])

        # strips '/' off file name (if it includes directory name)
        if "/" in curr_file:
            curr_file = curr_file[curr_file.rfind("/") + 1:]
        if "\\" in curr_file:
            curr_file = curr_file.replace("\\", "")

        # trim information if it's too long
        if len(curr_file) > 15:  # trims file name
            curr_file = curr_file[:15] + "..." + curr_file[-13:]
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "

        # appends each piece of information. output will show ONLY if at
        # least ONE file has data in a column
        self.container.append([
            " | " + curr_file, created, author, producer, modded, last_saved
        ])
    except Exception:
        # Deliberately best-effort: unreadable or malformed PDFs are skipped.
        return
def employer_resume_book_create(request):
    """Build a resume book file for the requesting recruiter and store it.

    When ``resume_book_id`` is posted, that existing book is re-delivered
    under its current name (404 if it does not exist).  Otherwise the
    recruiter's single undelivered book is fetched or created (surplus
    duplicates are deleted) and named ``<user>_<timestamp>``.

    With ``delivery_format == 'separate'`` the visible students' resumes
    are zipped individually; otherwise one PDF is produced: a table of
    contents followed by every resume.  The resulting file is saved to
    the book's ``resume_book`` file field and an empty ``HttpResponse``
    is returned.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and discard the rest.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        resume_book_name = "%s_%s" % (str(request.user), now,)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST['delivery_format'] == 'separate':
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name,)
        output = zipfile.ZipFile(file_name, 'w')
        try:
            for student in resume_book.students.visible():
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                with open(resume_path, "rb") as resume_file:
                    name = "%s %s (%s, %s).pdf" % (
                        student.first_name,
                        student.last_name,
                        student.graduation_year,
                        student.degree_program,
                    )
                    output.write(resume_file.name, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        # Evaluate the ordered student list once; it drives both the table
        # of contents and the resume pages, which must agree.
        students = list(resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name"))
        # Rows start at 25.5cm and step down 0.5cm, so 51 fit on a page.
        rows_per_page = 51

        def flush_contents_page(canvas, page_buffer):
            """Finish the canvas and append its single page to the output."""
            canvas.showPage()
            canvas.save()
            rendered = cStringIO.StringIO(page_buffer.getvalue())
            output.addPage(PdfFileReader(rendered).getPage(0))

        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        now = datetime.now()
        first_line = "Created on %s at %s" % (
            now.strftime('%m/%d/%Y'), now.strftime('%I:%M %p'))
        c.drawString(1 * cm, 28.5 * cm, first_line)
        c.drawString(1 * cm, 28 * cm, str(request.user.recruiter))
        c.drawString(1 * cm, 27.5 * cm, str(request.user.recruiter.employer))
        c.drawString(16 * cm, 28.5 * cm, "Created using Umeqo")
        c.drawString(8.5 * cm, 26.5 * cm, "Resume Book Contents")

        pad_from_top = 0
        for student in students:
            row_y = (25.5 - pad_from_top * .5) * cm
            c.drawString(6.5 * cm, row_y,
                         "%s %s" % (student.first_name, student.last_name))
            c.drawString(12 * cm, row_y,
                         "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            # Start a fresh contents page every time the current one fills,
            # not only the first time.
            if pad_from_top == rows_per_page:
                flush_contents_page(c, report_buffer)
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        flush_contents_page(c, report_buffer)

        # Resume pages are read lazily when the output is written, so keep
        # every resume open until then and close them all afterwards.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as outputStream:
                output.write(outputStream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
def processFile(self, curr_file):
    """Extract metadata from a PDF and append a row to ``self.container``.

    Files whose name does not contain ``.pdf`` are ignored.  For a PDF the
    creation date, author, producer and modification date are read from
    the document-info dictionary (``'-'`` when absent) and a row
    ``[name, created, author, producer, modded, last_saved]`` is appended.
    Over-long names, producers and authors are shortened.

    This is best-effort: a PDF that cannot be read or whose dates cannot
    be parsed is skipped silently, and nothing is appended when the
    document has no info dictionary.  Always returns None.
    """
    def format_stamp(raw):
        """Convert a PDF date (``D:YYYYMMDDHHMM...``) to ``MM/DD/YYYY hh:mm AM``."""
        data = raw.strip("D:|'")
        clock = time.strftime(
            "%I:%M %p",
            time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

    if ".pdf" not in curr_file:
        return

    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'
    try:
        # Info values may be resolved lazily from the file, so every lookup
        # happens while the stream is still open.
        with open(curr_file, 'rb') as pdf_stream:
            pdfFile = PdfFileReader(pdf_stream)
            if pdfFile.getIsEncrypted():
                pdfFile.decrypt('')
            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return

            # looks at the entire dictionary to parse for information
            if "/CreationDate" in docInfo:
                created = format_stamp(docInfo["/CreationDate"])
            if "/Author" in docInfo:
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"
            if "/Producer" in docInfo:
                # Drop the literal "(Windows)" tag; str.strip() would treat
                # it as a set of characters and eat into the real name.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                if len(producer) == 0:
                    producer = "-"
                # Collapse runs of spaces left by the substitution above.
                producer = re.sub(r' {2,}', ' ', producer)
            if "/ModDate" in docInfo:
                modded = format_stamp(docInfo["/ModDate"])

        # strips '/' off file name (if it includes directory name)
        if "/" in curr_file:
            curr_file = curr_file[curr_file.rfind("/") + 1:]
        if "\\" in curr_file:
            curr_file = curr_file.replace("\\", "")

        # trim information if it's too long
        if len(curr_file) > 15:  # trims file name
            curr_file = curr_file[:15] + "..." + curr_file[-13:]
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "

        # appends each piece of information. output will show ONLY if at
        # least ONE file has data in a column
        self.container.append(
            [" | " + curr_file, created, author, producer, modded, last_saved])
    except Exception:
        # Deliberately best-effort: unreadable or malformed PDFs are skipped.
        return