def process_pdf(path):
    """Extract text from a PDF file using PDFMiner, whitespace intact.

    :param str path: path to the PDF file
    :return: ``("", text)`` on success, ``(error_message, "")`` on failure,
        mirroring the error-path tuple the original already returned
    """
    try:
        pages = layout_scanner.get_pages(path)
    except Exception as e:  # Py2/Py3-compatible form of 'except Exception, e'
        # Report the failure through the shared error template.
        return g_error_template % e, ""
    # The original accumulated the text but fell off the end without
    # returning it (implicitly returning None); join is also O(n) where
    # the += loop was quadratic.
    return "", "".join(pages)
def pdf_extract(filename):
    """Extract text from *filename* and write one page per block to output/pdf.txt.

    :param str filename: path to the PDF file
    """
    pages = layout_scanner.get_pages(filename)
    # Original path literal was 'output\pdf.txt' -- the unescaped backslash
    # is an invalid escape and a non-portable separator; forward slashes
    # work on every platform open() supports.
    # 'with' guarantees the file is closed even if a write raises; text
    # mode ('w', not 'wb') matches the str pages being written.
    with open('output/pdf.txt', 'w') as fp:
        for page in pages:
            fp.write(page + "\n")
def pdf_extract(filename):
    """Extract text from *filename* and write one page per block to output/pdf.txt.

    :param str filename: path to the PDF file
    """
    pages = layout_scanner.get_pages(filename)
    # Original path literal was 'output\pdf.txt' -- the unescaped backslash
    # is an invalid escape and a non-portable separator; forward slashes
    # work on every platform open() supports.
    # 'with' guarantees the file is closed even if a write raises; text
    # mode ('w', not 'wb') matches the str pages being written.
    with open('output/pdf.txt', 'w') as fp:
        for page in pages:
            fp.write(page + "\n")
def process_pdf(path):
    """Extract text from a PDF file using PDFMiner, whitespace intact.

    Best-effort: on any extraction failure the error is reported and
    whatever text was gathered (possibly "") is still returned.

    :param str path: path to the PDF file
    :return: concatenated text of all pages, or "" on failure
    :rtype: str
    """
    text = ""
    try:
        # join is O(n) where the original += loop was quadratic; the old
        # accumulator also shadowed the builtin 'str'.
        text = "".join(layout_scanner.get_pages(path))
    except Exception as e:
        # Narrowed from a bare 'except:'; deliberately non-fatal, but now
        # says *what* failed instead of swallowing the detail.
        print("Error loading pdf: %s" % e)
    return text
def index_pdf_content(item, url, field_name):
    """Download the PDF at *url* and append its text to ``item[field_name]``.

    Non-PDF URLs get the sentinel u'NA'. The downloaded file is deleted
    once its text has been extracted.

    :param item: mapping-like scrapy item; ``item[field_name]`` must already
        exist for the ``+=`` below (preserved from the original contract)
    :param str url: source URL, possibly containing spaces
    :param str field_name: item key to append the extracted text to
    """
    # Guard clause replaces the original if/else pyramid.
    if not url.endswith('.pdf'):
        item[field_name] = u'NA'
        return
    download_pdf(url.replace(' ', '%20'))
    pdf_file = basename(url).replace(' ', '%20')
    pdf_path = 'slsascrapy/pdf_folder/' + pdf_file
    # The original also opened the file and built a PDFParser/PDFDocument/
    # PDFPageAggregator/PDFPageInterpreter chain and called
    # layout_scanner.get_pages(url), but none of those results were ever
    # used -- convert() does the actual extraction -- so the dead objects
    # are dropped and the file handle is no longer leaked.
    content = ' '.join(convert(pdf_file).split())
    item[field_name] += content
    os.remove(pdf_path)
def convPdf(portfolios_dir, f, verbose=0):
    """Convert the PDF *f* in *portfolios_dir* into a document dict.

    :param str portfolios_dir: directory holding the portfolio PDFs
    :param str f: PDF filename; also used as the document ``_id``
    :param int verbose: >0 prints the (currently unused) table of contents
    :return: dict with doctype/origin/_id/student_email/content keys
    :rtype: dict
    """
    pdf_filepath = os.path.join(portfolios_dir, f)
    toc = layout_scanner.get_toc(pdf_filepath)
    # Single verbose check (the original tested it twice in a row);
    # print() with one argument behaves identically on Python 2 and 3,
    # replacing the Py2-only print statements.
    if verbose > 0:
        print("TOC -- not currently used for dict")
        for entry in toc:
            print(entry)
    pages = layout_scanner.get_pages(pdf_filepath)
    doc_dict = {
        'doctype': "report",
        'origin': "shareworks",
        '_id': f,
        'student_email': email_from_fname(f),
        'content': [],
    }
    for count, page in enumerate(pages):
        if len(page) > 5:  # heuristic: treat very short pages as empty
            doc_dict['content'].append({
                'header': "Page %s" % count,
                'text': page.decode('utf-8'),
            })
    if not doc_dict['content']:
        # Placeholder entry so downstream consumers always find content.
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]
    return doc_dict
def scan_PDF_file(root_path):
    """Collect the first-page text of every PDF directly under *root_path*.

    :param str root_path: directory to scan (non-recursive)
    :return: (first_page_texts, matching_filenames) as parallel lists
    :rtype: tuple(list, list)
    """
    first_pages = []
    pdf_names = []
    # Iterate entries directly instead of the original range(len(...)) loop.
    for entry in os.listdir(root_path):
        path = os.path.join(root_path, entry)
        # endswith('pdf') (not '.pdf') preserved from the original, so e.g.
        # 'notapdf' would still match -- intentional behavior preservation.
        if os.path.isfile(path) and path.endswith('pdf'):
            pages = layout_scanner.get_pages(path)
            # pages[0] raises IndexError on an empty PDF, exactly as before.
            first_pages.append(pages[0])
            pdf_names.append(entry)
    return first_pages, pdf_names
def pdf_to_text_list(file_loc):
    """Extract the text of the last 10 pages of a PDF.

    Images and figures are ignored.

    :param str file_loc: path to the .PDF document on local disk
    :return: ``(total_page_count, last_10_pages)`` on success, or
        ``(-1, [])`` when the PDF cannot be parsed
    :rtype: tuple(int, list)
    """
    # Read PDF pages as text; images_folder=None skips image extraction
    # (try os.path.abspath("output/imgs") to keep images).
    pages = layout_scanner.get_pages(file_loc, images_folder=None)
    try:
        page_len = len(pages)
    except TypeError:
        # get_pages returned something without a length: parse failure.
        print("[!] Issue parsing PDF file", file=sys.stderr)
        return (-1, [])
    # Keep only the last 10 pages (we assume references never take more).
    # TODO: HARDCODE -- consider making the page count a parameter.
    return (page_len, pages[-10:])
def convPdf(portfolios_dir, f, verbose=0):
    """Convert the PDF *f* in *portfolios_dir* into a document dict.

    :param str portfolios_dir: directory holding the portfolio PDFs
    :param str f: PDF filename; also used as the document ``_id``
    :param int verbose: >0 prints the (currently unused) table of contents
    :return: dict with doctype/origin/_id/student_email/content keys
    :rtype: dict
    """
    pdf_filepath = os.path.join(portfolios_dir, f)
    toc = layout_scanner.get_toc(pdf_filepath)
    # Single verbose check (the original tested it twice in a row);
    # print() with one argument behaves identically on Python 2 and 3,
    # replacing the Py2-only print statements.
    if verbose > 0:
        print("TOC -- not currently used for dict")
        for entry in toc:
            print(entry)
    pages = layout_scanner.get_pages(pdf_filepath)
    doc_dict = {
        'doctype': "report",
        'origin': "shareworks",
        '_id': f,
        'student_email': email_from_fname(f),
        'content': [],
    }
    for count, page in enumerate(pages):
        if len(page) > 5:  # heuristic: treat very short pages as empty
            doc_dict['content'].append({
                'header': "Page %s" % count,
                'text': page.decode('utf-8'),
            })
    if not doc_dict['content']:
        # Placeholder entry so downstream consumers always find content.
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]
    return doc_dict
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Debug script: dump the TOC and page text of m2.pdf to m2rs.txt."""
import layout_scanner as ls

name = "m2.pdf"
rsname = "m2rs.txt"

toc = ls.get_toc(name)
for u in toc:
    print("level %s" % (u[0],))
    print("title %s" % (u[1],))

tc = ls.get_pages(name, images_folder='img')
# 'with' guarantees the output file is closed even on error.
with open(rsname, "wb+") as fd:
    for u in tc:
        a = u.decode('utf-8')
        print("type: %s" % type(u))
        print("str: %s" % a)
        print("len u: %s" % len(u))
        print("len a: %s" % len(a))
        # The file is open in binary mode, so write the original UTF-8
        # bytes. The original wrote u.decode('utf-8'), which in Python 2
        # triggers an implicit ASCII re-encode and raises
        # UnicodeEncodeError on any non-ASCII page text.
        fd.write(u)
""" Analyse a saved pdf """ import layout_scanner from tabula import read_pdf import pandas as pd import csv # Save file csv_file = 'save_file.csv' ISIN = 'NA' pages = layout_scanner.get_pages('orcadia_pdf.pdf') dfs = [] table_pages = [] table_page_title = 'Statement of investments and other net assets' toc_title = 'table of contents' table_end = 'The accompanying notes' start_row = 3 total_row_len = 3 side_column_titles = ['Asset Type', 'Country'] # Find the pages with the tables of interest for page_number, page in enumerate(pages): if table_page_title in ''.join( page) and not toc_title in ''.join(page).lower(): table_pages.append(page) rows = []
def txtparser(self, response):
    """Parse a downloaded product-monograph PDF into a pmScrapeItem.

    Extracts page-one text and full text via convert(), normalises page
    one for drug-name matching, detects the document language, fills
    file/scrape metadata, then matches the monograph against
    name_list/content_list to attach ATC code, synonyms, categories and
    dosages (u'NA' sentinels when nothing matches). Deletes the local
    PDF file before returning the item.

    :param response: scrapy response for a ``file://`` PDF URL
    :return: populated pmScrapeItem
    """
    pmspiderItem = pmScrapeItem()
    pdf_file = basename(response.url)
    ### clean pm_page_one ###
    # Raw whitespace-collapsed first page is stored on the item as-is...
    pmspiderItem['pm_page_one'] = ' '.join(convert(pdf_file, pages=[0]).split())
    # ...while a lowercased UTF-8 byte copy is used for matching below.
    pm_page_one = ' '.join(convert(pdf_file, pages=[0]).split()).encode('utf-8').lower()
    pm_page_one = pm_page_one.replace('classification', '')
    # Map every punctuation character to a space (Python 2 str.translate).
    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    pm_page_one = pm_page_one.translate(replace_punctuation)
    pm_page_one = re.sub(' +', ' ', pm_page_one.lower())
    ### typo correction ###
    # pm_page_one = pm_page_one.replace('somatropin','somatotropin')
    # Strip the (possibly letter-spaced) document title so it cannot
    # produce false drug-name matches.
    pm_page_one = pm_page_one.replace('p r o d u c t m o n o g r a p h', '')
    pm_page_one = pm_page_one.replace('product monograph', '')
    f = open('pmpageone.txt', 'w')
    f.write(pm_page_one)
    f.close()
    # Full-document text (whitespace-collapsed) for content + language.
    pmspiderItem['content'] = ' '.join(convert(pdf_file).split())
    content = ' '.join(convert(pdf_file).split())
    f = open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w')
    f.write(content)
    # NOTE(review): language.from_file reads the file before f.close() --
    # relies on the write being flushed; confirm this is safe.
    lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
    pmspiderItem['language'] = lang
    f.close()
    # Local filesystem path of the downloaded PDF (response URL is file://...).
    temp = (response.url).split('file://')[1]
    # NOTE(review): the pdfminer objects below are constructed but their
    # results are never used in this method, and fp is never closed --
    # candidates for removal.
    fp = open(temp)
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = layout_scanner.get_pages(response.url)
    # File / scrape metadata.
    pmspiderItem['file_type'] = 'PDF'
    pmspiderItem['pm_number'] = splitext(basename(response.url))[0].decode('utf-8')
    pm_number = splitext(basename(response.url))[0].decode('utf-8')
    pmspiderItem['file_path'] = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    file_path = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    pmspiderItem['file_name'] = basename(file_path).decode('utf-8')
    pmspiderItem['date_scraped'] = datetime.datetime.now()
    pmspiderItem['server'] = socket.gethostname()
    pmspiderItem['project'] = self.settings.get('BOT_NAME')
    pmspiderItem['spider'] = self.name
    pmspiderItem['content_length'] = len(content)
    # pt_term_index = []
    # pt_term_index=findItem(ptpm_list,pm_number)
    # if pt_term_index == []:
    #     pmspiderItem['pt_term'] = u'NA'
    #     pmspiderItem['pt_term_index'] = u'NA'
    # else:
    #     pmspiderItem['pt_term'] = ptpm_list[pt_term_index[0][0]][1].decode("utf-8")
    #     pmspiderItem['pt_term_index'] = str(pt_term_index[0][0]).decode("utf-8")
    count = 0
    # Try to match each known drug name against the cleaned first page.
    for k in range(len(name_list)):
        if count >= 1:
            break  # one match is enough
        text = name_list[k].translate(replace_punctuation)
        ele_list = text.split(' ')
        if len(ele_list) <= 4:
            # Short names: try every word ordering.
            ele_list = list(itertools.permutations(ele_list))
        else:
            ele_list = [' '.join(ele_list)]
        for i in range(len(ele_list)):
            # NOTE(review): when the name had > 4 words, ele_list[i] is
            # already a str, so this join inserts a space between every
            # character -- looks like a latent bug; verify intent.
            ele_list[i] = ' '.join(ele_list[i])
            if ele_list[i].lower() in pm_page_one.lower():
                # content_list rows are offset by one (header row).
                content_index = k + 1
                pmspiderItem['atc_code'] = content_list[content_index][0]
                pmspiderItem['synonyms'] = content_list[content_index][1]
                pmspiderItem['categories'] = content_list[content_index][3]
                pmspiderItem['dosages'] = content_list[content_index][4]
                pmspiderItem['matchiterm'] = name_list[k]
                count = count + 1
                print('yes')
                break
    # if count == 0:
    #     if synonyms_list[k] == '':
    #         print('empty list')
    #         break
    #     else:
    #         for synonyms in synonyms_list[k]:
    #             if synonyms == '':
    #                 print('missing value')
    #                 break
    #             if synonyms.lower() in pm_page_one.lower():
    #                 print("This is synonyms blablabla:%s"%synonyms)
    #                 content_index = k + 1
    #                 pmspiderItem['atc_code']=content_list[content_index][0]
    #                 pmspiderItem['synonyms']=content_list[content_index][1]
    #                 pmspiderItem['categories']=content_list[content_index][3]
    #                 pmspiderItem['dosages']=content_list[content_index][4]
    #                 pmspiderItem['matchiterm'] = synonyms
    #                 count = count + 1
    #                 print('yes1')
    #                 break
    if count == 0:
        # No drug name matched: fill the lookup fields with sentinels.
        pmspiderItem['atc_code'] = u'NA'
        pmspiderItem['synonyms'] = u'NA'
        pmspiderItem['categories'] = u'NA'
        pmspiderItem['dosages'] = u'NA'
        pmspiderItem['matchiterm'] = u'NA'
        print('no')
    os.remove(temp)
    return pmspiderItem
"""Interactive script: convert a PDF under ./test/ into an HTML folder."""
# NOTE(review): the original used 'os' throughout without importing it;
# added here unless it was imported above this chunk.
import os
import shutil

import layout_scanner
from Pdf2Html import Pdf2Html

# Ask for the PDF name and build an output folder named after it.
pdf_name = raw_input("Please enter the name of the pdf file (e.g. test.pdf)\n")
if os.path.isfile(os.path.join(os.getcwd(), 'test', pdf_name)):
    pdf_name = pdf_name[:-4]  # strip the '.pdf' extension
    out_dir = os.path.join(os.getcwd(), pdf_name)
    # Start from a clean output folder each run.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(pdf_name)
    os.makedirs(os.path.join(pdf_name, 'css'))
    os.makedirs(os.path.join(pdf_name, 'img'))
    os.chdir(out_dir)
    # Call the PDFMiner parser; returns the tree of PDF components.
    parsed_pages = layout_scanner.get_pages(
        os.path.join(os.pardir, 'test', pdf_name + '.pdf'))
    # Create html file 'index.html' from the parsed components.
    converter = Pdf2Html()
    converter.make_html(parsed_pages)
def getPageWiseText(bookPath):
    """Return the per-page text of the book at *bookPath*."""
    return layout_scanner.get_pages(bookPath)
def scanPDF(path):
    """Return the text of the first page of *path*, or '' when no pages exist."""
    pages = layout_scanner.get_pages(path)
    if not pages:
        return ''
    return pages[0]
"""Manual smoke test for layout_scanner: extract a PDF and append it to a file."""
import layout_scanner
from cStringIO import StringIO  # NOTE(review): unused in this snippet

# Inputs / outputs for the test run.
file_and_name = '/home/hjiang/pmscrapy/pdf_folder/00000001.PDF'
folder = '/home/hjiang/scrapy/'
# NOTE(review): 'homne' looks like a typo for 'home' -- variable is unused
# here, but verify before reusing it.
images_folder = '/homne/hjiang/scrapy/tmp/'
file_name = '00002207.PDF'
output_name = 'test2.txt'
pwd = ''  # PDF password (empty = unencrypted)

toc = layout_scanner.get_toc(file_and_name, pwd)
# (Removed a bare 'len(toc)' expression statement -- it had no effect.)
pages = layout_scanner.get_pages(file_and_name)
# Write each page's text to folder/output_name.
for page in pages:
    layout_scanner.write_file(folder, output_name, page)