def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            result_pages = []
            i = 0
            d = pdf.Document(fname, quiet=True)
            for i, p in enumerate(d, start=1):
                text_array = []
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(unicode(l.text))
                result_pages.append('\n'.join(text_array))
                if i >= pages:  # break after x pages
                    break
            log.debug("Processed %i pages (%i max)", i, pages)
            return result_pages
        except:
            # reraise everything
            raise
    else:
        # This code path is disabled until the Tika service is fixed (see issue 178)
        # try:
        #     content = parser.from_file(fname)['content']
        #     return (content or '').encode('UTF-8')
        # except:
        #     # reraise everything
        #     raise
        pass

def PDFFastRead(args, session):
    import pdfparser.poppler as pdf
    from shutil import move
    from os import symlink
    import re
    import hashlib

    flowArr = ''
    fileSrc = session['srcPath'] + session['srcHash'] + '.pdf'
    document = pdf.Document(fileSrc.encode())
    totalPages = document.no_of_pages
    isbn = ''
    pattern = r'^ISBN\s[\d].*'
    reCompiled = re.compile(pattern)
    for pages in document:
        pageNum = pages.page_no
        for flow in pages:
            for block in flow:
                flowArr += str(block.bbox.as_tuple())
                # only scan the first and last few pages for an ISBN line
                if pageNum < 6 or totalPages - pageNum < 6:
                    for line in block:
                        result = reCompiled.findall(line.text)
                        if not result == []:
                            isbn = str(result[0]).split()[1]
    contentHash = hashlib.sha1(flowArr.encode('utf-8')).hexdigest()
    fileDst = session['prepocessedPath'] + contentHash + '.pdf'
    move(fileSrc, fileDst)
    symlink(fileDst, fileSrc)
    session['contentHash'] = contentHash
    session['isbn'] = isbn
    return session

def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))
                if i >= pages:  # break after x pages
                    break
            log.debug("Processed %i pages (%i max)", i, pages)
            return '\n'.join(text_array)
        except:
            # reraise everything
            raise
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except:
            # reraise everything
            raise

def pdf_convert(src_dir, dst_dir):
    try:
        # if the file is still empty after a short wait, give up
        if os.stat(src_dir).st_size == 0:
            time.sleep(5)
            if os.stat(src_dir).st_size == 0:
                return
        txt = []
        file_name = bytes(src_dir, "utf-8")
        d = pdf.Document(file_name, False)
        for p in d:
            page = {'page': []}
            for f in p:
                flow = {'flow': []}
                for b in f:
                    block = {'block': [], 'bbox': b.bbox.as_tuple()}
                    for l in b:
                        block['block'].append({
                            'line': l.text,
                            'bbox': l.bbox.as_tuple()
                        })
                    flow['flow'].append(block)
                page['page'].append(flow)
            txt.append(page)
        with open(dst_dir, 'w') as f:
            ujson.dump(txt, f, indent=4, ensure_ascii=False)
    except Exception as e:
        logger.error('Failed to convert %s to txt!' % src_dir)

def __init__(self, path):
    self.path = path
    self.buf = []
    self.year = None
    self.__gap = 1
    self.df = None
    doc = pdf.Document(path.encode())
    tmp = 0
    total = doc.no_of_pages
    print(path)
    with ProgressBar() as pbar:
        for page in doc:
            pa = []
            tmp += 1
            pbar.update(int((tmp / (total - 1)) * 100))
            for f in page:
                for bbox in f:
                    for line in bbox:
                        pa.append(line.text)
            self.buf.append(pa)
    found = False
    for j in range(len(self.buf)):
        for i in self.buf[j]:
            if '年度报告' in i:  # "annual report"
                self.year = re.sub('[^0-9]', '', i)
                if self.year.isdigit():
                    found = True
                    break
        if found:
            break
    if self.year == '':
        self.year = randint(1, 1000)
    tmp = self._search_string("公司信息")  # "company information"
    if len(tmp) != 0:
        # df = read_pdf(path, pages=str(tmp[0] + 1), silent=True,
        df = read_pdf(self.path, pages=str(tmp[0] + 1) + '-' + str(tmp[0] + 2), silent=True,
                      multiple_tables=True, pandas_option={'header': None})
        df = pd.concat(df[:])
        df.index = range(df.shape[0])
        self.df = df.fillna(' ')

def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))
                if i == pages:  # break after x pages
                    break
            print "Processed %i pages" % (i)
            return '\n'.join(text_array)
        except Exception as e:
            print "PDF Parser Exception: ", e
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except Exception as e:
            print "File Parser Exception: ", e

import pdfparser.poppler as pdf
import sys
import re

MIN_SECTION_HEADER_SIZE = 13
MAX_SECTION_HEADER_SIZE = 14.5

d = pdf.Document(sys.argv[1])
sections = [['', ['']]]
last_color = None
last_font_size = None
# print('No of pages', d.no_of_pages)
for p in d:
    # print('Page', p.page_no, 'size =', p.size)
    for f in p:
        # print(' ' * 1, 'Flow')
        for b in f:
            # print(' ' * 2, 'Block', 'bbox=', b.bbox.as_tuple())
            for l in b:
                # print(' ' * 3, l.text.encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % l.bbox.as_tuple())
                # assert l.char_fonts.comp_ratio < 1.0
                for i in range(len(l.text)):
                    # print(l.text[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % l.char_bboxes[i].as_tuple(),
                    #       l.char_fonts[i].name, l.char_fonts[i].size, l.char_fonts[i].color)
                    if MIN_SECTION_HEADER_SIZE < l.char_fonts[i].size < MAX_SECTION_HEADER_SIZE:
                        sections[-1][0] += l.text[i].encode('UTF-8')
                    elif MIN_SECTION_HEADER_SIZE < last_font_size < MAX_SECTION_HEADER_SIZE:
                        sections.append(['', ['']])

p.add_argument('-f', '--first-page', type=int, help='first page')
p.add_argument('-l', '--last-page', type=int, help='last page')
p.add_argument('--phys-layout', action='store_true',
               help='Physical Layout - param for text analysis')
p.add_argument('--fixed-pitch', type=float, default=0.0,
               help='Fixed pitch - param for text analysis - app. max space size')
p.add_argument('-q', '--quiet', action='store_true',
               help='Silence all output from poppler')
args = p.parse_args()

d = pdf.Document(args.document, args.phys_layout, args.fixed_pitch, args.quiet)  # @UndefinedVariable
fp = args.first_page or 1
lp = args.last_page or d.no_of_pages
print 'No of pages', d.no_of_pages
for p in d:
    if p.page_no < fp or p.page_no > lp:
        continue
    print 'Page', p.page_no, 'size =', p.size
    for f in p:
        print ' ' * 1, 'Flow'
        for b in f:
            print ' ' * 2, 'Block', 'bbox=', b.bbox.as_tuple()
            for l in b:
                print ' ' * 3, l.text.encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % l.bbox.as_tuple()