def run():
    """Command-line entry point: OCR a single PDF and save its text.

    The PDF path is taken from ``sys.argv[1]``.  When the argument count is
    not exactly one, a usage message is printed and nothing is converted.

    Otherwise the file is handed to ``convert_document`` with a freshly
    generated UUID string as ``temp_dir`` (thumbnails are requested in its
    ``thumbs`` sub-directory, and ``delete_files=False`` is passed), and
    the returned text is written to ``<pdf_filename>.txt``.

    The text file is written only when the converter reports success, so a
    failed run does not create or overwrite the output file.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Welcome to yapot!")

    if len(sys.argv) != 2:
        print("Usage:\n\n\tpython cli-tool.py <pdf_filename>\n\n")
    else:
        pdf_filename = sys.argv[1]
        # A new UUID per invocation keeps the working directory name unique.
        temp_dir = str(uuid.uuid4())

        success, pdf_text = convert_document(
            pdf_filename=pdf_filename,
            resolution=200,
            delete_files=False,
            page_delineation='\n--------\n',
            verbose=True,
            temp_dir=temp_dir,
            make_thumbs=True,
            thumb_size=512,
            thumb_dir='{0}/thumbs'.format(temp_dir),
        )

        if success:
            with open('%s.txt' % pdf_filename, 'w') as f:
                f.write(pdf_text)
        else:
            # Previously the result was written regardless of the flag.
            print("Conversion failed; no text file was written.")

    print("Done.")
def convert_document(filename):
    """Convert a PDF document to whitespace-normalised text using yapot.

    Args:
        filename: path of the PDF, passed to ``yapot.convert_document`` as
            ``pdf_filename``.

    Returns:
        The extracted text with leading/trailing whitespace stripped and
        every run of spaces, tabs, carriage returns and newlines collapsed
        to a single space.  ``None`` when yapot reports failure or the
        extracted text is blank.
    """
    # Resolution and thumbnail options are not passed, so yapot's own
    # defaults apply to them.
    success, pdf_text = yapot.convert_document(
        pdf_filename=filename,
        delete_files=False,
        page_delineation='\n',
        verbose=_DEBUG,
        temp_dir='./.tmp',
    )

    if _DEBUG:
        print("PDF Contents:\n\n")
        print(pdf_text)
        print("\n\n")

    if not success:
        return None

    text = pdf_text.strip()
    if text == '':
        return None

    # The original called text.replace('\r', '\n') without keeping the
    # result, so carriage returns were never normalised.  Treating '\r' as
    # whitespace here gives the intended outcome, and a single pass over
    # the combined character class is equivalent to the former three
    # rounds of separate space/tab/newline substitutions.
    return re.sub(r'[ \t\r\n]+', ' ', text)
def run():
    """Command-line entry point: OCR a single PDF and save its text.

    The PDF path is taken from ``sys.argv[1]``.  When the argument count is
    not exactly one, a usage message is printed and nothing is converted.

    Otherwise the file is handed to ``convert_document`` (with the
    user-expanded path as ``base_page_name`` and ``delete_files=True``),
    and the returned text is written to ``<pdf_filename>.txt``.

    The text file is written only when the converter reports success, so a
    failed run does not create or overwrite the output file.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Welcome to yapot!")

    if len(sys.argv) != 2:
        print("Usage:\n\n\tpython cli-tool.py <pdf_filename>\n\n")
    else:
        pdf_filename = sys.argv[1]
        # Expand a leading "~" so the page name refers to a real path.
        base_page_name = os.path.expanduser(pdf_filename)

        success, pdf_text = convert_document(
            pdf_filename=pdf_filename,
            base_page_name=base_page_name,
            resolution=200,
            delete_files=True,
            page_delineation='\n--------\n',
            verbose=True,
        )

        if success:
            with open('%s.txt' % pdf_filename, 'w') as f:
                f.write(pdf_text)
        else:
            # Previously the result was written regardless of the flag.
            print("Conversion failed; no text file was written.")

    print("Done.")
def view_doc_post(request):
    """Accept an uploaded PDF, extract its text and answer with JSON.

    The upload is read from ``request.POST['file']``, stored under
    ``UPLOAD_FOLDER`` as ``<uuid>.pdf`` and sniffed with ``magic``.  For a
    PDF the text returned by yapot is saved next to it as ``<uuid>.txt``.

    The JSON body always carries ``code`` and ``status``; on success it
    also carries ``doc_uid`` and ``text``.  ``code`` is 200 on success,
    415 for a non-PDF upload and 400 when no file was posted.  No explicit
    HTTP status is passed to ``Response``; the code is reported in the
    body only.
    """
    resp = {'code': 200, 'status': 'Success.'}

    if 'file' not in request.POST:
        resp['code'] = 400
        resp['status'] = "Missing file for upload."
    else:
        upload = request.POST['file'].file
        doc_uid = '%s' % uuid.uuid4()
        pdf_path = '%s/%s.pdf' % (UPLOAD_FOLDER, doc_uid)

        # Copy the upload to disk in 128 KiB chunks (2 << 16 bytes).
        with open(pdf_path, 'wb') as pdf_out:
            upload.seek(0)
            chunk = upload.read(2 << 16)
            while chunk:
                pdf_out.write(chunk)
                chunk = upload.read(2 << 16)

        if magic.from_file(pdf_path, mime=True) != 'application/pdf':
            resp['code'] = 415
            resp['status'] = "Invalid file type."
        else:
            # NOTE(review): the return value is used directly as text;
            # confirm the installed yapot returns a string here rather
            # than a (success, text) pair.
            text = yapot.convert_document(pdf_path, resolution=300, pool_count=8)
            with open('%s/%s.txt' % (UPLOAD_FOLDER, doc_uid), 'w') as txt_out:
                txt_out.write(text)

            resp['doc_uid'] = doc_uid
            resp['text'] = text
            resp['code'] = 200
            resp['status'] = "File uploaded successfully."

    return Response(json.dumps(resp), content_type='application/json')
def _convert_document(self, doc):
    """Download one document, OCR it with yapot and store the text.

    ``doc`` is a document record.  The original author's note describes
    its layout as::

        {
            "parent_url": parent_url,
            "doc_url": doc_url,
            "scraper_id": "",
            "scrape_datetime": datetime.datetime.utc(),
            "converted": False,
            "convert_datetime": None,
            "local_filename": "",
            "link_text": link_text,
            "document_meta_data": { },
            "contents": "",
        }

    Only ``doc['doc_url']`` and ``doc['_id']`` are read here.

    Returns:
        The success flag reported by ``yapot.convert_document``.  The
        stored record is updated only when that flag is truthy.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if self.verbose:
        print("Unconverted document found, processing.")

    doc_filename = self._download_document(doc['doc_url'])
    # The directory and file name are joined without a separator, so
    # download_dir is expected to already end with one.
    doc_path = '{0}{1}'.format(self.download_dir, doc_filename)

    # Time only the conversion itself; the download is excluded.
    start_time = time.time()
    success, pdf_text = yapot.convert_document(
        pdf_filename=doc_path,
        resolution=self.resolution,
        delete_files=True,
        page_delineation='\n--------\n',
        verbose=self.verbose,
        make_thumbs=True,
        thumb_size=512,
        thumb_dir=self.download_dir,
        thumb_prefix='{0}_thumb_page_'.format(doc_filename),
    )
    convert_time = time.time() - start_time

    if self.verbose:
        print("Updating document ...")

    if success:
        session.update_document(
            id=doc['_id'],
            contents=pdf_text,
            document_meta_data={},
            local_filename=doc_filename,
            convert_time=convert_time,
        )
        if self.verbose:
            print("Done updating document.")

    if self.verbose:
        print("Done processing document.")

    return success
def _convert_document(self, doc):
    """Download one document, OCR it with yapot and store the text.

    ``doc`` is a document record.  The original author's note describes
    its layout as::

        {
            "parent_url": parent_url,
            "doc_url": doc_url,
            "scraper_id": "",
            "scrape_datetime": datetime.datetime.utc(),
            "converted": False,
            "convert_datetime": None,
            "local_filename": "",
            "link_text": link_text,
            "document_meta_data": { },
            "contents": "",
        }

    Only ``doc['doc_url']`` and ``doc['_id']`` are read here.

    Returns:
        The success flag reported by ``yapot.convert_document``.  The
        stored record is updated only when that flag is truthy.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if self.verbose:
        print("Unconverted document found, processing.")

    doc_filename = self._download_document(doc['doc_url'])
    # The directory and file name are joined without a separator, so
    # download_dir is expected to already end with one.
    doc_path = '{0}{1}'.format(self.download_dir, doc_filename)

    # Time only the conversion itself; the download is excluded.
    start_time = time.time()
    success, pdf_text = yapot.convert_document(
        pdf_filename=doc_path,
        resolution=self.resolution,
        delete_files=True,
        page_delineation='\n--------\n',
        verbose=self.verbose,
        make_thumbs=True,
        thumb_size=512,
        thumb_dir=self.download_dir,
        thumb_prefix='{0}_thumb_page_'.format(doc_filename),
    )
    convert_time = time.time() - start_time

    if self.verbose:
        print("Updating document ...")

    if success:
        session.update_document(
            id=doc['_id'],
            contents=pdf_text,
            document_meta_data={},
            local_filename=doc_filename,
            convert_time=convert_time,
        )
        if self.verbose:
            print("Done updating document.")

    if self.verbose:
        print("Done processing document.")

    return success
def ocr(fn):
    """Return the text yapot extracts from ``fn``, or ``''`` on failure.

    ``fn`` is passed unchanged to ``yapot.convert_document``, whose
    ``(success, text)`` result decides which value is returned.
    """
    ok, extracted = yapot.convert_document(fn)
    return extracted if ok else ''