Example #1
0
def file_parser(fname, pages=None):
    """Extract the text of a PDF file, one string per page.

    Args:
        fname: Path of the file to parse.
        pages: Maximum number of pages to read. ``None`` (the default)
            means no limit, i.e. the whole document is read.

    Returns:
        A list with one newline-joined string per processed page when
        ``magic`` reports ``fname`` as ``application/pdf``; ``None`` for any
        other MIME type, because the non-PDF path is currently disabled.

    Raises:
        Any exception raised by ``magic.from_file`` or by the PDF parser is
        propagated to the caller unchanged.
    """
    if magic.from_file(fname, mime=True) != 'application/pdf':
        # This code path is disabled until the Tika service is fixed (see issue 178)

        # try:
        #     content = parser.from_file(fname)['content']
        #     return (content or '').encode('UTF-8')
        # except:
        #     # reraise everything
        #     raise
        return None

    result_pages = []
    i = 0  # stays 0 when the document yields no pages, so the log call is safe
    d = pdf.Document(fname, quiet=True)
    for i, p in enumerate(d, start=1):
        # Each page is walked as page -> f -> b -> l; only the innermost
        # items carry ``.text``.
        text_array = [unicode(l.text) for f in p for b in f for l in b]
        result_pages.append('\n'.join(text_array))

        # Enforce the limit only when one was given: comparing an int with
        # None stops after the first page on Python 2 and raises TypeError
        # on Python 3.
        if pages is not None and i >= pages:  # break after x pages
            break

    # %s (not %i) for the limit, because it may legitimately be None.
    log.debug("Processed %i pages (%s max)", i, pages)
    return result_pages
Example #2
0
def PDFFastRead(args, session):
    """Fingerprint a source PDF by its block layout, relocate it, and find its ISBN.

    The file ``session['srcPath'] + session['srcHash'] + '.pdf'`` is opened,
    the bounding box of every block is fed into a SHA-1 digest, and lines on
    pages near the beginning or end of the document are scanned for a line
    starting with ``ISBN`` followed by whitespace and a digit.  The file is
    then moved to ``session['prepocessedPath'] + <digest> + '.pdf'`` and a
    symlink pointing at the new location is left at the old path.

    Args:
        args: Unused by this function; kept for interface compatibility.
        session: Mapping providing ``'srcPath'``, ``'srcHash'`` and
            ``'prepocessedPath'`` (the key really is spelled that way).

    Returns:
        The same ``session`` object with ``'contentHash'`` (hex SHA-1 of the
        layout) and ``'isbn'`` (second whitespace-separated token of the last
        matching line, or ``''`` when nothing matched) added.
    """
    import pdfparser.poppler as pdf
    from shutil import move
    from os import symlink
    import re
    import hashlib

    fileSrc = session['srcPath'] + session['srcHash'] + '.pdf'
    document = pdf.Document(fileSrc.encode())
    totalPages = document.no_of_pages
    isbn = ''
    # Raw string: '\s' and '\d' are invalid escapes in a normal literal.
    isbnPattern = re.compile(r'^ISBN\s[\d].*')
    # Hash incrementally instead of concatenating one big string first;
    # successive update() calls yield the same digest as hashing the
    # concatenation.
    layoutDigest = hashlib.sha1()

    for page in document:
        pageNum = page.page_no
        # Only pages close to either end of the document are searched.
        scanForIsbn = pageNum < 6 or totalPages - pageNum < 6
        for flow in page:
            for block in flow:
                layoutDigest.update(
                    str(block.bbox.as_tuple()).encode('utf-8'))
                if not scanForIsbn:
                    continue
                for line in block:
                    result = isbnPattern.findall(line.text)
                    if result:
                        # Later matches overwrite earlier ones (last one wins).
                        isbn = str(result[0]).split()[1]

    contentHash = layoutDigest.hexdigest()
    fileDst = session['prepocessedPath'] + contentHash + '.pdf'
    move(fileSrc, fileDst)
    symlink(fileDst, fileSrc)
    session['contentHash'] = contentHash
    session['isbn'] = isbn

    return session
def file_parser(fname, pages=None):
    """Return the text content of a file as a UTF-8 encoded byte string.

    Args:
        fname: Path of the file to parse.
        pages: Maximum number of PDF pages to read. ``None`` (the default)
            means no limit.  Ignored for non-PDF files.

    Returns:
        For files that ``magic`` reports as ``application/pdf``: the UTF-8
        encoded text of every line, joined with newlines.  For anything else:
        the ``'content'`` entry returned by ``parser.from_file`` encoded as
        UTF-8 (an empty byte string when that entry is falsy).

    Raises:
        Any exception raised by ``magic``, the PDF parser or
        ``parser.from_file`` is propagated to the caller unchanged.
    """
    if magic.from_file(fname, mime=True) == 'application/pdf':
        text_array = []
        i = 0  # stays 0 when the document yields no pages
        d = pdf.Document(fname)
        for i, p in enumerate(d, start=1):
            text_array.extend(
                l.text.encode('UTF-8') for f in p for b in f for l in b)

            # Enforce the limit only when one was given: comparing an int
            # with None stops after one page on Python 2 and raises
            # TypeError on Python 3.
            if pages is not None and i >= pages:  # break after x pages
                break

        # %s (not %i) for the limit, because it may legitimately be None.
        log.debug("Processed %i pages (%s max)", i, pages)
        # The items are encoded byte strings, so the separator must be bytes
        # too; on Python 2 b'\n' and '\n' are the same object type.
        return b'\n'.join(text_array)

    content = parser.from_file(fname)['content']
    return (content or '').encode('UTF-8')
Example #4
0
def pdf_convert(src_dir, dst_dir):
    """Dump the page/flow/block/line structure of a PDF to a JSON file.

    Despite their names, both arguments are used as file paths.

    Args:
        src_dir: Path of the PDF file to read.
        dst_dir: Path of the JSON file to write.

    Returns:
        None.  On success ``dst_dir`` contains a list with one
        ``{'page': [...]}`` entry per page; each page holds ``{'flow': [...]}``
        entries, each flow holds ``{'block': [...], 'bbox': ...}`` entries and
        each block holds ``{'line': <text>, 'bbox': ...}`` entries.

    Errors are logged and swallowed (best effort); nothing is raised to the
    caller for ``Exception`` subclasses.
    """
    try:
        if os.stat(src_dir).st_size == 0:
            # Empty file: wait and look once more before giving up
            # (presumably the file may still be being written - TODO confirm).
            time.sleep(5)
            if os.stat(src_dir).st_size == 0:
                return

        d = pdf.Document(bytes(src_dir, "utf-8"), False)
        txt = []
        for p in d:
            page = {'page': []}
            for f in p:
                flow = {'flow': []}
                for b in f:
                    block = {'block': [], 'bbox': b.bbox.as_tuple()}
                    block['block'].extend(
                        {'line': l.text, 'bbox': l.bbox.as_tuple()}
                        for l in b)
                    flow['flow'].append(block)
                page['page'].append(flow)
            txt.append(page)

        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # is pinned instead of depending on the platform default.
        with open(dst_dir, 'w', encoding='utf-8') as out:
            ujson.dump(txt, out, indent=4, ensure_ascii=False)
    except Exception as e:
        # The original message referenced ``filename``, which is not defined
        # in this function; report the source path and the actual error.
        logger.error('Impossible convert %s to txt! (%r)' % (src_dir, e))
Example #5
0
    def __init__(self, path):
        """Read every text line of the PDF at ``path`` and pre-extract metadata.

        After construction:

        * ``self.buf`` holds one list of line texts per page.
        * ``self.year`` holds the digits of the first line that contains
          '年度报告' and at least one digit.
        * ``self.df`` holds the concatenated tables that ``read_pdf`` finds
          on the two pages starting at the first hit of
          ``self._search_string("公司信息")``, or ``None`` when there is no
          hit.

        Args:
            path: Path of the PDF file, as ``str``.
        """
        self.path = path
        self.buf = []
        self.year = None
        self.__gap = 1
        self.df = None

        doc = pdf.Document(path.encode())
        total = doc.no_of_pages
        print(path)
        with ProgressBar() as pbar:
            for done, page in enumerate(doc, start=1):
                # Percentage of pages handled so far.  Dividing by
                # ``total - 1`` crashed on one-page documents and went past
                # 100 on the last page.
                pbar.update(min(100, int(done * 100 / max(total, 1))))
                self.buf.append(
                    [line.text for f in page for bbox in f for line in bbox])

        # Stop at the first matching line that actually contains digits.
        found = False
        for page_lines in self.buf:
            for text in page_lines:
                if '年度报告' in text:
                    self.year = re.sub('[^0-9]', '', text)
                    if self.year.isdigit():
                        found = True
                        break
            if found:
                break
        # NOTE(review): only the "marker seen, but without digits" case gets
        # the random placeholder; when the marker never appears ``self.year``
        # stays None.  Confirm whether that asymmetry is intended.
        if self.year == '':
            self.year = randint(1, 1000)

        hits = self._search_string("公司信息")
        if len(hits) != 0:
            # presumably hits[0] is a zero-based page index, hence the +1/+2
            # when building the page range - TODO confirm
            page_range = str(hits[0] + 1) + '-' + str(hits[0] + 2)
            df = read_pdf(self.path, pages=page_range, silent=True,
                          multiple_tables=True, pandas_option={'header': None})
            df = pd.concat(df[:])
            df.index = range(df.shape[0])
            self.df = df.fillna(' ')
def file_parser(fname, pages=None):
    """Return the text content of a file as a UTF-8 encoded byte string.

    Failures are reported on standard output and swallowed (best effort).

    Args:
        fname: Path of the file to parse.
        pages: Stop after exactly this many PDF pages.  With the default
            ``None`` the equality test never matches, so the whole document
            is read.  Ignored for non-PDF files.

    Returns:
        For files that ``magic`` reports as ``application/pdf``: the UTF-8
        encoded text of every line, joined with newlines.  For anything else:
        the ``'content'`` entry returned by ``parser.from_file`` encoded as
        UTF-8 (an empty byte string when that entry is falsy).  ``None`` when
        parsing raised an exception.
    """
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            # Pre-bind the counter so a document without pages reports
            # "0 pages" instead of failing on an unbound name.
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                text_array.extend(
                    l.text.encode('UTF-8') for f in p for b in f for l in b)

                if i == pages:  # break after x pages
                    break

            # Single-argument print(...) behaves the same whether print is a
            # statement (Python 2) or a function (Python 3).
            print("Processed %i pages" % (i,))
            # The items are encoded byte strings, so join with a bytes
            # separator; on Python 2 b'\n' and '\n' are the same type.
            return b'\n'.join(text_array)
        except Exception as e:
            # Two spaces on purpose: matches the former
            # ``print "...: ", e`` output.
            print("PDF Parser Exception:  %s" % (e,))
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except Exception as e:
            print("File Parser Exception:  %s" % (e,))
    return None
Example #7
0
import pdfparser.poppler as pdf
import sys
import re

# Exclusive lower/upper bounds on a character's font size for it to be
# treated as part of a section header.
MIN_SECTION_HEADER_SIZE = 13
MAX_SECTION_HEADER_SIZE = 14.5

# The PDF to analyse is given as the first command-line argument.
d = pdf.Document(sys.argv[1])

# List of [header_text, [...]] entries.  Header characters are appended to the
# first element of the last entry; how the second element is filled is not
# visible in this snippet.
sections = [['', ['']]]

# NOTE(review): neither variable is reassigned anywhere in the visible code,
# so the ``elif`` below compares against None.  That only works on Python 2;
# Python 3 raises TypeError.  The snippet looks truncated - confirm against
# the full script.
last_color = None
last_font_size = None
# print('No of pages', d.no_of_pages)
for p in d:
    # print('Page', p.page_no, 'size =', p.size)
    for f in p:
        # print(' '*1,'Flow')
        for b in f:
            # print(' '*2,'Block', 'bbox=', b.bbox.as_tuple())
            for l in b:
                # print(' '*3, l.text.encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.bbox.as_tuple())
                #assert l.char_fonts.comp_ratio < 1.0
                # Walk the line character by character; l.char_fonts[i] is
                # indexed in step with l.text[i].
                for i in range(len(l.text)):
                    # print(l.text[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.char_bboxes[i].as_tuple(),\
                    #   l.char_fonts[i].name, l.char_fonts[i].size, l.char_fonts[i].color,)
                    if MIN_SECTION_HEADER_SIZE < l.char_fonts[
                            i].size < MAX_SECTION_HEADER_SIZE:
                        # Header-sized character: extend the current header.
                        # NOTE(review): appending an encoded byte string to a
                        # str works on Python 2 only.
                        sections[-1][0] += l.text[i].encode('UTF-8')
                    elif MIN_SECTION_HEADER_SIZE < last_font_size < MAX_SECTION_HEADER_SIZE:
                        # Previous size was header-sized but this character is
                        # not: start a new, empty section entry.
                        sections.append(['', ['']])
Example #8
0
# ``p`` is the argument parser created earlier in the script (not shown here).
p.add_argument('-f', '--first-page', type=int, help='first page')
# Help text used to read 'first page' (copy-paste slip).
p.add_argument('-l', '--last-page', type=int, help='last page')
p.add_argument('--phys-layout',
               action='store_true',
               help='Physical Layout - param for text analysis')
p.add_argument(
    '--fixed-pitch',
    type=float,
    default=0.0,
    help='Fixed pitch - param for text analysis - app. max space size')
p.add_argument('-q',
               '--quiet',
               action='store_true',
               help='Silence all output from poppler')
args = p.parse_args()
d = pdf.Document(args.document, args.phys_layout, args.fixed_pitch,
                 args.quiet)  # @UndefinedVariable
# Missing (or 0) bounds fall back to the whole document.
fp = args.first_page or 1
lp = args.last_page or d.no_of_pages

# Every print below passes ONE pre-formatted string in parentheses, so the
# output is the same whether print is a statement (Python 2) or a function
# (Python 3).  The format strings reproduce the single-space separators the
# former comma-separated print statements emitted.
print('No of pages %s' % (d.no_of_pages,))
# NOTE: from here on ``p`` is rebound to the current page and no longer
# refers to the argument parser.
for p in d:
    if p.page_no < fp or p.page_no > lp:
        continue
    print('Page %s size = %s' % (p.page_no, p.size))
    for f in p:
        print('%s %s' % (' ' * 1, 'Flow'))
        for b in f:
            print('%s Block bbox= %s' % (' ' * 2, b.bbox.as_tuple()))
            for l in b:
                print('%s %s %s' % (
                    ' ' * 3,
                    l.text.encode('UTF-8'),
                    '(%0.2f, %0.2f, %0.2f, %0.2f)' % l.bbox.as_tuple()))