def process_pdf(path):
    """Extract text from a PDF file using PDFMiner, keeping whitespace intact."""
    text = ""
    try:
        pages = layout_scanner.get_pages(path)
        # Concatenate the text of every page
        for page in pages:
            text += page
    except Exception as e:
        return g_error_template % e, ""
    return text
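For context, a minimal usage sketch. layout_scanner is the PDFMiner-based helper module used throughout these examples; g_error_template is assumed here to be a simple module-level format string (its real definition is not shown in the source):

# Hypothetical setup for the snippet above; both names are assumptions.
g_error_template = "Error processing PDF: %s"

text = process_pdf("sample.pdf")  # "sample.pdf" is a placeholder path
print text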
Example #2
def pdf_extract(filename):
    # Extract the page text with layout_scanner and save it to a file
    pages = layout_scanner.get_pages(filename)
    with open('output/pdf.txt', 'wb') as fp:
        # Write the text contents of each page
        for page in pages:
            fp.write(page + "\n")
def process_pdf(path):
    """Extract text from a PDF file using PDFMiner, keeping whitespace intact."""
    text = ""

    try:
        pages = layout_scanner.get_pages(path)
        # Concatenate the text of every page
        for page in pages:
            text += page

    except Exception:
        print "Error loading PDF"

    return text
def index_pdf_content(item, url, field_name):
    if url.endswith('.pdf'):
        # Fetch the PDF, URL-encoding any spaces in the name
        download_pdf(url.replace(' ', '%20'))
        pdf_file = basename(url).replace(' ', '%20')
        fp = open('slsascrapy/pdf_folder/' + pdf_file, 'rb')
        # Set up the PDFMiner parsing pipeline
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = layout_scanner.get_pages(url)
        # Collapse all whitespace in the converted text to single spaces
        content = ' '.join(convert(pdf_file).split())
        item[field_name] += content
        fp.close()
        os.remove('slsascrapy/pdf_folder/' + pdf_file)
    else:
        item[field_name] = u'NA'
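Note that the PDFMiner objects above are constructed but never driven; the extracted text actually comes from convert(). For reference, a minimal sketch of how such an aggregator pipeline is normally run (assuming the pdfminer.six API):

# Sketch only: iterate the document's pages through the interpreter
# and pull the laid-out objects from the aggregator.
from pdfminer.pdfpage import PDFPage

for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)
    layout = device.get_result()  # an LTPage with this page's layout objects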
def convPdf(portfolios_dir, f, verbose=0):
    pdf_filepath = os.path.join(portfolios_dir, f)
    toc = layout_scanner.get_toc(pdf_filepath)
    if verbose > 0:
        print "TOC -- not currently used for dict"
        for e in toc:
            print e
    pages = layout_scanner.get_pages(pdf_filepath)
    # Build a document dictionary with one entry per non-empty page
    doc_dict = {'doctype': "report", 'origin': "shareworks"}
    doc_dict['_id'] = f
    doc_dict['student_email'] = email_from_fname(f)
    doc_dict['content'] = []
    for (count, p) in enumerate(pages):
        if len(p) > 5:  # treat very short pages as empty
            doc_dict['content'].append({'header': "Page %s" % count, 'text': p.decode('utf-8')})
    if len(doc_dict['content']) == 0:
        # Placeholder entry so downstream consumers always find some content
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]

    return doc_dict
Example #7
def scan_PDF_file(root_path):
    # Collect the first-page text and the filename of every PDF in root_path
    stringList = []
    palist = []
    for fname in os.listdir(root_path):
        path = os.path.join(root_path, fname)
        if os.path.isfile(path) and path.endswith('.pdf'):
            pages = layout_scanner.get_pages(path)
            stringList.append(pages[0])
            palist.append(fname)
    return stringList, palist
def pdf_to_text_list(file_loc):
    """
     Extracts text (string) of PDF file contents. Images, figures are ignored.
    :param str file_loc: Path to .PDF document on local disk
    :return: The last 10 pages of the PDF document as string text, a list of strings
    :rtype: list
    """
    # Read PDF pages as text
    pages = layout_scanner.get_pages(
        file_loc,
        images_folder=None)  # you can try os.path.abspath("output/imgs")
    try:
        page_len = len(pages)
    except TypeError:
        print("[!] Issue parsing PDF file", file=sys.stderr)
        return (-1, [])

    # Take only last 10 pages (We assume references never take more) TODO:HARDCODE
    pages = pages[-10:]

    return (page_len, pages)
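A short usage sketch (the file name is a placeholder): the helper returns a (page_count, pages) tuple, with -1 signalling a parse failure:

page_len, last_pages = pdf_to_text_list("paper.pdf")
if page_len == -1:
    print("Could not parse the PDF")
else:
    print("Parsed %d pages; kept the last %d" % (page_len, len(last_pages)))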
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import layout_scanner as ls

name = "m2.pdf"
rsname = "m2rs.txt"
fd = open(rsname, "wb+")

# Print the table of contents as (level, title) pairs
toc = ls.get_toc(name)
for u in toc:
    print "level", u[0]
    print "title", u[1]

# Extract the page text (images go to the 'img' folder) and write it out
tc = ls.get_pages(name, images_folder='img')
for u in tc:
    print "type:", type(u)
    a = u.decode('utf-8')
    print "str:", a
    print "len u:", len(u)
    print "len a:", len(a)
    # The file is opened in binary mode, so write the raw UTF-8 bytes
    fd.write(u)
fd.close()
Example #11
"""
Analyse a saved pdf
"""

import layout_scanner
from tabula import read_pdf
import pandas as pd
import csv

# Save file
csv_file = 'save_file.csv'
ISIN = 'NA'

pages = layout_scanner.get_pages('orcadia_pdf.pdf')
dfs = []
table_pages = []

table_page_title = 'Statement of investments and other net assets'
toc_title = 'table of contents'
table_end = 'The accompanying notes'
start_row = 3
total_row_len = 3
side_column_titles = ['Asset Type', 'Country']

# Find the pages that contain the tables of interest
for page_number, page in enumerate(pages):
    page_text = ''.join(page)
    if table_page_title in page_text and toc_title not in page_text.lower():
        table_pages.append(page)

rows = []
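The original example is truncated at this point. Purely as an illustrative sketch of a possible continuation (not part of the source): tabula's read_pdf expects page numbers rather than page text, so one way forward is to record the matching 1-based page numbers and hand them to tabula:

# Hypothetical continuation: collect page numbers instead of text,
# then let tabula pull the tables from just those pages.
table_page_numbers = [n + 1 for n, page in enumerate(pages)
                      if table_page_title in ''.join(page)
                      and toc_title not in ''.join(page).lower()]
if table_page_numbers:
    dfs = read_pdf('orcadia_pdf.pdf', pages=table_page_numbers)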
def txtparser(self, response):
    pmspiderItem = pmScrapeItem()
    pdf_file = basename(response.url)

    ### clean pm_page_one ###
    pmspiderItem['pm_page_one'] = ' '.join(convert(pdf_file, pages=[0]).split())
    pm_page_one = ' '.join(convert(pdf_file, pages=[0]).split()).encode('utf-8').lower()
    pm_page_one = pm_page_one.replace('classification', '')
    # Map every punctuation character to a space, then squeeze repeated spaces
    replace_punctuation = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    pm_page_one = pm_page_one.translate(replace_punctuation)
    pm_page_one = re.sub(' +', ' ', pm_page_one.lower())
    ### typo correction ###
    # pm_page_one = pm_page_one.replace('somatropin', 'somatotropin')
    pm_page_one = pm_page_one.replace('p r o d u c t m o n o g r a p h', '')
    pm_page_one = pm_page_one.replace('product monograph', '')
    f = open('pmpageone.txt', 'w')
    f.write(pm_page_one)
    f.close()

    # Save the full text, then detect the document language from the file
    pmspiderItem['content'] = ' '.join(convert(pdf_file).split())
    content = ' '.join(convert(pdf_file).split())
    f = open('/home/hjiang/pmscrapy/pdf_text/pdftext.txt', 'w')
    f.write(content)
    f.close()
    lang = language.from_file('/home/hjiang/pmscrapy/pdf_text/pdftext.txt')
    pmspiderItem['language'] = lang

    # Set up the PDFMiner parsing pipeline for the local copy of the file
    temp = response.url.split('file://')[1]
    fp = open(temp, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = layout_scanner.get_pages(response.url)

    pmspiderItem['file_type'] = 'PDF'
    pmspiderItem['pm_number'] = splitext(basename(response.url))[0].decode('utf-8')
    pm_number = splitext(basename(response.url))[0].decode('utf-8')
    pmspiderItem['file_path'] = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    file_path = 'https://pdf.hres.ca/dpd_pm/%s.PDF' % pm_number
    pmspiderItem['file_name'] = basename(file_path).decode('utf-8')
    pmspiderItem['date_scraped'] = datetime.datetime.now()
    pmspiderItem['server'] = socket.gethostname()
    pmspiderItem['project'] = self.settings.get('BOT_NAME')
    pmspiderItem['spider'] = self.name
    pmspiderItem['content_length'] = len(content)

    # Match each drug name (or, for short names, every word-order
    # permutation of it) against the cleaned first page
    count = 0
    for k in range(len(name_list)):
        if count >= 1:
            break
        text = name_list[k].translate(replace_punctuation)
        ele_list = text.split(' ')
        if len(ele_list) <= 4:
            ele_list = [' '.join(p) for p in itertools.permutations(ele_list)]
        else:
            ele_list = [' '.join(ele_list)]
        for ele in ele_list:
            if ele.lower() in pm_page_one.lower():
                content_index = k + 1
                pmspiderItem['atc_code'] = content_list[content_index][0]
                pmspiderItem['synonyms'] = content_list[content_index][1]
                pmspiderItem['categories'] = content_list[content_index][3]
                pmspiderItem['dosages'] = content_list[content_index][4]
                pmspiderItem['matchiterm'] = name_list[k]
                count = count + 1
                print('yes')
                break
    if count == 0:
        pmspiderItem['atc_code'] = u'NA'
        pmspiderItem['synonyms'] = u'NA'
        pmspiderItem['categories'] = u'NA'
        pmspiderItem['dosages'] = u'NA'
        pmspiderItem['matchiterm'] = u'NA'
        print('no')
    fp.close()
    os.remove(temp)
    return pmspiderItem
Example #13
import os
import shutil
import layout_scanner
from Pdf2Html import Pdf2Html

# Ask for the PDF file name and create an output folder named after it
now = raw_input("Please enter the name of the pdf file (e.g. test.pdf)\n")

if os.path.isfile(os.getcwd() + '/test/' + now):
    now = now[0:len(now) - 4]  # strip the '.pdf' extension
    if os.path.exists(os.getcwd() + '/' + now):
        shutil.rmtree(os.getcwd() + '/' + now)

    os.makedirs(now)
    os.makedirs(now + '/css')
    os.makedirs(now + '/img')
    os.chdir(os.getcwd() + '/' + now)

    # Call the PDFMiner parser; it returns the tree of PDF components
    myparser = layout_scanner.get_pages(os.pardir + '/test/' + now + '.pdf')

    # Instantiate an object from the Pdf2Html converter class
    converterObj = Pdf2Html()

    # Create the html file 'index.html'
    converterObj.make_html(myparser)
def getPageWiseText(bookPath):
    pages = layout_scanner.get_pages(bookPath)
    return pages
Example #15
def scanPDF(path):
    string = ''
    pages = layout_scanner.get_pages(path)
    if pages:
        string = pages[0]
    return string
Example #16
import layout_scanner

file_and_name = '/home/hjiang/pmscrapy/pdf_folder/00000001.PDF'
folder = '/home/hjiang/scrapy/'
images_folder = '/home/hjiang/scrapy/tmp/'
file_name = '00002207.PDF'
output_name = 'test2.txt'
pwd = ''

# Read the table of contents; pwd is the (empty) document password
toc = layout_scanner.get_toc(file_and_name, pwd)
print len(toc)

pages = layout_scanner.get_pages(file_and_name)
# print len(pages)

# Write the text of each page to the output file
for page in pages:
    layout_scanner.write_file(folder, output_name, page)