def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Collect the text under annotations that overlap a page layout box.

    The text is produced by the external ``pdftotext`` program, which is
    asked to crop the annotation rectangle and print it to stdout.

    Args:
        anno: iterable of annotation dicts. Each must provide 'rect',
            indexed here as [0]=left, [1]=bottom, [2]=right, [3]=top,
            and 'page' (used for both the -f and -l options).
        box: layout object offering is_hoverlap(), is_voverlap() and a
            child list ``_objs``.
        filename: path of the PDF file handed to pdftotext.
        pheight: page height, used to flip the y axis (pdftotext has its
            origin at the top-left, the annotation rect at bottom-left).
        verbose: accepted for interface compatibility; not used here.

    Returns:
        (texts, num): the merged text, passed through wordfix.fixWord()
        when non-empty, and the number of annotations overlapping ``box``.
    '''

    def scaled(value):
        # pdftotext only takes integer coordinates; rendering at 720 dpi
        # (10x its default 72 dpi) and multiplying by 10 keeps one decimal.
        return int(round(10. * value))

    texts = u''
    num = 0

    # One reusable stand-in text line; its bbox is replaced per annotation.
    probe = LTTextLine([1, 2, 3, 4])

    for idx, mark in enumerate(anno):
        rect = mark['rect']
        probe.set_bbox(rect)  # the bbox has to be set explicitly

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        num += 1
        pieces = []
        lines = sortY(box._objs)

        for row in lines:
            if type(row) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (row.is_hoverlap(probe) and row.is_voverlap(probe)):
                continue

            # '-' as the output file makes pdftotext write to stdout, so
            # no temporary file is needed.
            page = mark['page']
            command = [
                'pdftotext', '-f', page, '-l', page, '-r', 720,
                '-x', scaled(rect[0]),
                '-y', scaled(pheight - rect[3]),
                '-W', scaled(rect[2] - rect[0]),
                '-H', scaled(rect[3] - rect[1]),
                os.path.abspath(filename), '-',
            ]
            proc = Popen([str(item) for item in command],
                         stdout=PIPE, stderr=PIPE)
            pieces.append(tools.deu(proc.communicate()[0]))

            # Stop at the first matching line: lines may overlap, and the
            # same highlight would otherwise be fetched twice.
            break

        # Flatten the captured text onto a single line.
        fragment = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen marks a word split across highlights: drop it
        # and glue the next fragment on without a space.
        if len(texts) > 1 and texts[-1] == '-':
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        linegap, chargap = measureGap(lines)
        fragment = fragment.strip()

        # Only from the second contributing annotation on is there a
        # previous rectangle to compare against.
        starting = idx == 0 or len(texts) == 0
        if not starting and checkJump(lastbox, rect, row, linegap, chargap):
            fragment = u' ...... ' + fragment
        texts += joiner + fragment
        lastbox = rect

    texts = texts.strip()

    if texts:
        texts = wordfix.fixWord(texts)

    return texts, num
def findStrFromBox(anno, box, verbose=True):
    '''Gather the text covered by annotations lying inside a layout box.

    Characters are read straight from the pdfminer layout objects held by
    ``box``; no external program is involved.

    Args:
        anno: iterable of annotation dicts; only the 'rect' entry is read.
        box: layout container offering is_hoverlap(), is_voverlap() and a
            child list ``_objs``.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        (texts, num): the merged text, passed through wordfix.fixWord()
        when non-empty, and the number of annotations overlapping ``box``.
    '''
    texts = u''
    num = 0

    for idx, mark in enumerate(anno):
        rect = mark['rect']

        # Stand-in text line carrying the annotation rectangle, so the
        # layout objects' own overlap tests can be applied to it.
        probe = LTTextLine(rect)
        probe.set_bbox(rect)  # the bbox has to be set explicitly

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        num += 1
        pieces = []
        lines = sortY(box._objs)

        for row in lines:
            if type(row) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (row.is_hoverlap(probe) and row.is_voverlap(probe)):
                continue

            for glyph in row._objs:
                kind = type(glyph)
                if kind == LTAnno:
                    # LTAnno items are taken unconditionally.
                    pieces.append(glyph.get_text())
                elif kind == LTChar:
                    # Real characters count only when under the rectangle.
                    if glyph.is_hoverlap(probe) and glyph.is_voverlap(probe):
                        pieces.append(glyph.get_text())

        # Flatten the captured characters onto a single line.
        fragment = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen marks a word split across highlights: drop it
        # and glue the next fragment on without a space.
        if len(texts) > 1 and texts[-1] == '-':
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        linegap, chargap = measureGap(lines)
        fragment = fragment.strip()

        # Only from the second contributing annotation on is there a
        # previous rectangle to compare against.
        starting = idx == 0 or len(texts) == 0
        if not starting and checkJump(lastbox, rect, row, linegap, chargap):
            fragment = u' ...... ' + fragment
        texts += joiner + fragment
        lastbox = rect

    texts = texts.strip()

    if texts:
        texts = wordfix.fixWord(texts)

    return texts, num
def findStrFromBox(anno,box,verbose=True): '''Locate and extract strings from a page layout obj Extract text using pdfminer ''' texts=u'' num=0 #----------------Loop through annos---------------- for ii,hii in enumerate(anno): #----------Create a dummy LTTextLine obj---------- hiibox=hii['rect'] dummy=LTTextLine(hiibox) dummy.set_bbox(hiibox) #Needs this step if box.is_hoverlap(dummy) and box.is_voverlap(dummy): textii=[] num+=1 lines=sortY(box._objs) #----------------Loop through lines---------------- for lineii in lines: if type(lineii)!=LTTextLine and\ type(lineii)!=LTTextLineHorizontal: continue if lineii.is_hoverlap(dummy) and\ lineii.is_voverlap(dummy): #chars=sortX(lineii._objs) chars=lineii._objs #----------------Loop through chars---------------- for charii in chars: if type(charii)==LTAnno: textii.append(charii.get_text()) elif type(charii)==LTChar: if charii.is_hoverlap(dummy) and\ charii.is_voverlap(dummy): textii.append(charii.get_text()) #----------------Concatenate texts---------------- textii=u''.join(textii).strip(' ') textii=textii.strip('\n') textii=textii.replace('\n',' ') #---------------Join with next line--------------- if len(texts)>1 and texts[-1]=='-': texts=texts[:-1] joiner=u'' else: joiner=u' ' #---------------Jump--------------- linegap,chargap=measureGap(lines) textii=textii.strip() if ii==0 or len(texts)==0: texts+=joiner+textii lastbox=hiibox else: #lastbox=anno[ii-1]['rect'] if checkJump(lastbox, hiibox, lineii,linegap,chargap): textii=u' ...... '+textii texts+=joiner+textii else: texts+=joiner+textii lastbox=hiibox texts=texts.strip() #------------------Do some fixes------------------ if len(texts)>0: texts=wordfix.fixWord(texts) return texts, num
def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdftotext: for each annotation overlapping ``box``
    the annotation rectangle is cropped from the PDF by the external
    ``pdftotext`` program, written to 'tmp.txt' and read back.

    Args:
        anno: iterable of annotation dicts. Each must provide 'rect',
            indexed here as [0]=left, [1]=bottom, [2]=right, [3]=top,
            and 'page' (used for both the -f and -l options).
        box: layout object offering is_hoverlap(), is_voverlap() and a
            child list ``_objs``.
        filename: path of the PDF file handed to pdftotext.
        pheight: page height, used to flip the y axis.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        (texts, num): the merged text, passed through wordfix.fixWord()
        when non-empty, and the number of annotations overlapping ``box``.
    '''

    texts = u''
    num = 0

    # pdftotext requires int coordinates, scale default dpi of
    # pdftotext (72) to 720, and multiply coordinates by 10.
    def coord2str(x):
        return int(round(10. * x))

    #----------------Loop through annos----------------
    for ii, hii in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        hiibox = hii['rect']
        dummy = LTTextLine(hiibox)
        dummy.set_bbox(hiibox)  # Needs this step

        if not (box.is_hoverlap(dummy) and box.is_voverlap(dummy)):
            continue

        textii = []
        num += 1

        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        for lineii in lines:
            # Exact type match on purpose: other subclasses are skipped.
            if type(lineii) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (lineii.is_hoverlap(dummy) and lineii.is_voverlap(dummy)):
                continue

            #------Call pdftotext and save to a temp file------
            # NOTE: pdftotext coordinate has origin at top-left.
            # Coordinates from Mendeley has origin at bottom-left.
            # NOTE: 'tmp.txt' is a fixed name relative to the current
            # working directory, so it is overwritten on every call.
            args = ['pdftotext', '-f', hii['page'], '-l', hii['page'],
                    '-r', 720,
                    '-x', coord2str(hiibox[0]),
                    '-y', coord2str(pheight - hiibox[3]),
                    '-W', coord2str(hiibox[2] - hiibox[0]),
                    '-H', coord2str(hiibox[3] - hiibox[1]),
                    os.path.abspath(filename), 'tmp.txt']
            args = [str(arg) for arg in args]

            pp = Popen(args)
            # wait() returns as soon as the process ends, whatever its
            # exit status. The former "poll() != 0" loop spun forever
            # whenever pdftotext failed, since poll() then keeps
            # returning the non-zero code.
            if pp.wait() == 0:
                # Read the file only on success, so that output left
                # over from an earlier run is never picked up.
                tii = tools.readFile('tmp.txt', False)
                textii.append(tii)

            # break to avoid double sampling. Lines from lineii may
            # overlap, and may fetch a highlight twice if not break.
            break

        #----------------Concatenate texts----------------
        textii = u''.join(textii).strip(' ')
        textii = textii.strip('\n')
        textii = textii.replace('\n', ' ')

        #---------------Join with next line---------------
        # A trailing hyphen is removed and the next piece is attached
        # without a separating space.
        if len(texts) > 1 and texts[-1] == '-':
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        textii = textii.strip()

        # Only from the second contributing annotation on is there a
        # previous rectangle (lastbox) to compare against.
        if ii != 0 and len(texts) != 0:
            if checkJump(lastbox, hiibox, lineii, linegap, chargap):
                textii = u' ...... ' + textii
        texts += joiner + textii
        lastbox = hiibox

    texts = texts.strip()

    #------------------Do some fixes------------------
    if len(texts) > 0:
        texts = wordfix.fixWord(texts)

    return texts, num