Exemple #1
0
 def _parse_pages(self, book_path):
     """Extract the text of every page of the PDF at ``book_path``.

     Logs a message with the page count before processing and a message
     with the elapsed wall-clock time afterwards.

     Args:
         book_path: Path of the PDF file; it is opened in binary mode.

     Returns:
         A list with one dict per page, in document order. Each dict has
         the keys ``'page_num'`` (the 1-based page number, as a string)
         and ``'text'`` (whatever ``extractText()`` returns for the page).
     """
     start = time.time()
     with open(book_path, 'rb') as fh:
         input_pdf = PdfFileReader(fh)
         num_pages = input_pdf.getNumPages()
         self._log.info('Start processing %s with %s pages...', book_path,
                        num_pages)
         # Use the reader's public page accessor rather than its private
         # ``flattenedPages`` / ``_flatten()`` internals, which are not
         # part of the documented interface. Extraction stays inside the
         # ``with`` block so the file handle is still open while the
         # pages are read.
         result = [
             {
                 'page_num': str(page_num),
                 'text': input_pdf.getPage(page_num - 1).extractText(),
             }
             for page_num in range(1, num_pages + 1)
         ]
     self._log.info('Finished processing %s in %s seconds.', book_path,
                    time.time() - start)
     return result