Beispiel #1
0
def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Collect the annotated text that falls inside one layout box.

    For every annotation whose rectangle overlaps <box>, the region under
    the rectangle is cropped out of the PDF by running the external
    ``pdftotext`` program (its output is read from stdout), and the pieces
    are joined into a single string.

    Args:
        anno: iterable of dicts. Each one is read through its 'rect' key
            (indexed 0..3 and used as x0, y0, x1, y1) and its 'page' key.
        box: layout object offering is_hoverlap(), is_voverlap() and an
            _objs collection of child objects.
        filename: path of the PDF file handed to pdftotext.
        pheight: page height, used to flip the y axis: the rectangles have
            their origin at the bottom-left, pdftotext at the top-left.
        verbose: not used by this function.

    Returns:
        (texts, num): the joined text (passed through wordfix.fixWord()
        when non-empty) and the number of annotations overlapping <box>.
    '''

    def scaled(value):
        # pdftotext only takes integer crop coordinates. Rendering at
        # 720 dpi instead of the default 72 keeps one decimal of precision,
        # hence the factor of 10.
        return int(round(10. * value))

    texts = u''
    num = 0

    # A single probe object is reused for every annotation; only its bbox
    # is swapped inside the loop.
    probe = LTTextLine([1, 2, 3, 4])

    for idx, highlight in enumerate(anno):

        rect = highlight['rect']
        probe.set_bbox(rect)   # the probe must carry the annotation bbox

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        pieces = []
        num += 1
        lines = sortY(box._objs)

        # NOTE: <line> is read again after this loop (it is handed to
        # checkJump), so it must keep whatever value the loop leaves in it.
        for line in lines:
            kind = type(line)
            if not (kind == LTTextLine or kind == LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            # Build the crop window in pdftotext's top-left based frame.
            page = highlight['page']
            left = scaled(rect[0])
            top = scaled(pheight - rect[3])
            width = scaled(rect[2] - rect[0])
            height = scaled(rect[3] - rect[1])

            # '-' as the output file makes pdftotext write to stdout, which
            # avoids a round trip through a temporary file.
            command = ['pdftotext', '-f', page, '-l', page, '-r', 720,
                       '-x', left, '-y', top, '-W', width, '-H', height,
                       os.path.abspath(filename), '-']
            command = [str(item) for item in command]

            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout_data = proc.communicate()[0]
            # presumably tools.deu() decodes the raw output to unicode --
            # confirm against the tools module.
            pieces.append(tools.deu(stdout_data))

            # Lines may overlap each other; stop at the first hit so the
            # same highlight is not fetched twice.
            break

        # Tidy up the fragment: trim spaces and newlines at the ends and
        # turn inner newlines into spaces.
        fragment = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen in the text collected so far means a word was
        # split: drop the hyphen and glue the fragment on directly.
        if len(texts) > 1 and texts.endswith('-'):
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        linegap, chargap = measureGap(lines)
        fragment = fragment.strip()

        # Flag a jump between the previous and the current rectangle with
        # an ellipsis, except for the very first fragment.
        is_first = (idx == 0 or len(texts) == 0)
        if not is_first and checkJump(lastbox, rect, line, linegap, chargap):
            fragment = u' ...... ' + fragment
        texts += joiner + fragment

        lastbox = rect

    texts = texts.strip()

    # Post-process the words only when something was collected.
    if texts:
        texts = wordfix.fixWord(texts)

    return texts, num
Beispiel #2
0
def findStrFromBox(anno, box, verbose=True):
    '''Collect the annotated text that falls inside one layout box.

    The text is taken straight from the pdfminer layout objects: for each
    annotation overlapping <box>, the characters of every overlapping line
    are picked up and the pieces are joined into a single string.

    Args:
        anno: iterable of dicts, each read through its 'rect' key.
        box: layout object offering is_hoverlap(), is_voverlap() and an
            _objs collection of child objects.
        verbose: not used by this function.

    Returns:
        (texts, num): the joined text (passed through wordfix.fixWord()
        when non-empty) and the number of annotations overlapping <box>.
    '''

    texts = u''
    num = 0

    for idx, note in enumerate(anno):

        # Wrap the annotation rectangle in a throw-away layout object so
        # the overlap tests of the layout classes can be used on it.
        rect = note['rect']
        probe = LTTextLine(rect)
        probe.set_bbox(rect)  # the bbox has to be set explicitly

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        pieces = []
        num += 1
        lines = sortY(box._objs)

        # NOTE: <line> is read again after this loop (it is handed to
        # checkJump), so it must keep whatever value the loop leaves in it.
        for line in lines:
            kind = type(line)
            if not (kind == LTTextLine or kind == LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            for item in line._objs:
                item_kind = type(item)
                if item_kind == LTAnno:
                    # LTAnno objects are always kept.
                    keep = True
                elif item_kind == LTChar:
                    # Real characters are kept only under the rectangle.
                    keep = (item.is_hoverlap(probe)
                            and item.is_voverlap(probe))
                else:
                    keep = False
                if keep:
                    pieces.append(item.get_text())

        # Tidy up the fragment: trim spaces and newlines at the ends and
        # turn inner newlines into spaces.
        fragment = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen in the text collected so far means a word was
        # split: drop the hyphen and glue the fragment on directly.
        if len(texts) > 1 and texts.endswith('-'):
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        linegap, chargap = measureGap(lines)
        fragment = fragment.strip()

        # Flag a jump between the previous and the current rectangle with
        # an ellipsis, except for the very first fragment.
        is_first = (idx == 0 or len(texts) == 0)
        if not is_first and checkJump(lastbox, rect, line, linegap, chargap):
            fragment = u' ...... ' + fragment
        texts += joiner + fragment

        lastbox = rect

    texts = texts.strip()

    # Post-process the words only when something was collected.
    if texts:
        texts = wordfix.fixWord(texts)

    return texts, num
Beispiel #3
0
def findStrFromBox(anno, box, verbose=True):
    '''Pull the text covered by annotations out of a layout box.

    Only pdfminer layout objects are consulted. Annotations that do not
    overlap <box> are ignored; for the others the characters lying under
    the annotation rectangle are concatenated.

    Args:
        anno: iterable of dicts, each read through its 'rect' key.
        box: layout object offering is_hoverlap(), is_voverlap() and an
            _objs collection of child objects.
        verbose: not used by this function.

    Returns:
        A tuple (texts, num): texts is the assembled string, run through
        wordfix.fixWord() if it is not empty; num counts the annotations
        that overlapped <box>.
    '''

    def touches(obj, target):
        # True when <obj> overlaps <target> both horizontally and vertically.
        return obj.is_hoverlap(target) and obj.is_voverlap(target)

    collected = u''
    num = 0

    for position, entry in enumerate(anno):

        # A dummy layout object stands in for the annotation rectangle.
        bbox = entry['rect']
        marker = LTTextLine(bbox)
        marker.set_bbox(bbox)   # required: the bbox must be set explicitly

        if not touches(box, marker):
            continue

        chunks = []
        num += 1
        rows = sortY(box._objs)

        # NOTE: <row> is used again below in the checkJump() call, i.e. it
        # deliberately keeps the value it has when this loop finishes.
        for row in rows:
            row_type = type(row)
            if not (row_type == LTTextLine
                    or row_type == LTTextLineHorizontal):
                continue
            if not touches(row, marker):
                continue

            # LTAnno objects are taken unconditionally, LTChar objects only
            # when they overlap the annotation rectangle.
            chunks += [
                glyph.get_text() for glyph in row._objs
                if type(glyph) == LTAnno
                or (type(glyph) == LTChar and touches(glyph, marker))
            ]

        # Normalise the fragment: outer spaces/newlines removed, inner
        # newlines replaced by spaces.
        snippet = u''.join(chunks).strip(' ')
        snippet = snippet.strip('\n').replace('\n', ' ')

        # If the text so far ends in a hyphen, remove it and append the new
        # fragment without a separating space.
        hyphenated = len(collected) > 1 and collected[-1] == '-'
        if hyphenated:
            collected = collected[:-1]
        glue = u'' if hyphenated else u' '

        linegap, chargap = measureGap(rows)
        snippet = snippet.strip()

        # From the second fragment on, mark a jump from the previous
        # rectangle with an ellipsis.
        if position != 0 and len(collected) != 0:
            if checkJump(lastbox, bbox, row, linegap, chargap):
                snippet = u' ...... ' + snippet
        collected += glue + snippet

        lastbox = bbox

    collected = collected.strip()

    # Word fixes are applied only to non-empty results.
    if len(collected) > 0:
        collected = wordfix.fixWord(collected)

    return collected, num
Beispiel #4
0
def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdftotext: for every annotation whose rectangle
    overlaps <box>, the region under the rectangle is cropped out of the
    PDF by the external ``pdftotext`` program, which writes it to the file
    'tmp.txt' in the current working directory; that file is then read
    back.

    Args:
        anno: iterable of dicts. Each one is read through its 'rect' key
            (indexed 0..3 and used as x0, y0, x1, y1) and its 'page' key.
        box: layout object offering is_hoverlap(), is_voverlap() and an
            _objs collection of child objects.
        filename: path of the PDF file handed to pdftotext.
        pheight: page height, used to flip the y axis: the rectangles have
            their origin at the bottom-left, pdftotext at the top-left.
        verbose: not used by this function.

    Returns:
        (texts, num): the joined text (passed through wordfix.fixWord()
        when non-empty) and the number of annotations overlapping <box>.
        An annotation whose pdftotext run fails contributes no text but is
        still counted in num.
    '''

    texts = u''
    num = 0
    # pdftotext requires int coordinates, scale default dpi of
    # pdftotext (72) to 720, and multiply coordinates by 10.
    coord2str = lambda x: int(round(10. * x))

    #----------------Loop through annos----------------
    for ii, hii in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        hiibox = hii['rect']
        dummy = LTTextLine(hiibox)
        dummy.set_bbox(hiibox)  #Needs this step

        if not (box.is_hoverlap(dummy) and box.is_voverlap(dummy)):
            continue

        textii = []
        num += 1

        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        # NOTE: <lineii> is read again after this loop (it is handed to
        # checkJump), so it must keep whatever value the loop leaves in it.
        for lineii in lines:
            # Exact type match on purpose: isinstance() would also accept
            # other LTTextLine subclasses.
            if type(lineii) != LTTextLine and\
                    type(lineii) != LTTextLineHorizontal:
                continue
            if not (lineii.is_hoverlap(dummy) and lineii.is_voverlap(dummy)):
                continue

            #------Call pdftotext and save to a temp file------
            # NOTE: pdftotext coordinate has origin at top-left.
            # Coordinates from Mendeley has origin at bottom-left.
            args = ['pdftotext', '-f', hii['page'], '-l', hii['page'],
                    '-r', 720,
                    '-x', coord2str(hiibox[0]),
                    '-y', coord2str(pheight - hiibox[3]),
                    '-W', coord2str(hiibox[2] - hiibox[0]),
                    '-H', coord2str(hiibox[3] - hiibox[1]),
                    os.path.abspath(filename), 'tmp.txt']
            args = [str(arg) for arg in args]

            # Block until pdftotext terminates. Polling for an exit status
            # of exactly 0 would spin forever whenever pdftotext fails,
            # because poll() then keeps returning the non-zero status.
            pp = Popen(args)
            if pp.wait() == 0:
                tii = tools.readFile('tmp.txt', False)
                textii.append(tii)
            # On failure tmp.txt is not read: it may be missing or still
            # hold the text of an earlier highlight.

            # break to avoid double sampling. Lines from lineii may
            # overlap, and may fetch a highlight twice if not break.
            break

        #----------------Concatenate texts----------------
        textii = u''.join(textii).strip(' ')

        textii = textii.strip('\n')
        textii = textii.replace('\n', ' ')

        #---------------Join with next line---------------
        # A trailing hyphen means a word was split across highlights: drop
        # the hyphen and append the next piece without a space.
        if len(texts) > 1 and texts[-1] == '-':
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        textii = textii.strip()
        # From the second piece on, mark a jump from the previous rectangle
        # with an ellipsis.
        if ii != 0 and len(texts) != 0:
            if checkJump(lastbox, hiibox, lineii, linegap, chargap):
                textii = u' ...... ' + textii
        texts += joiner + textii

        lastbox = hiibox

    texts = texts.strip()

    #------------------Do some fixes------------------
    if len(texts) > 0:
        texts = wordfix.fixWord(texts)

    return texts, num