def convPdf(portfolios_dir, f, verbose=0):
    """Convert one PDF into a report document dictionary.

    The PDF at ``portfolios_dir/f`` is read with ``layout_scanner``. Each
    page whose length exceeds 5 becomes one entry of the ``content`` list,
    labelled with its zero-based position among all pages.

    Args:
        portfolios_dir: Directory that contains the PDF.
        f: File name of the PDF. It is stored as the document ``_id`` and
            handed to ``email_from_fname`` to obtain ``student_email``.
        verbose: When greater than 0, the table of contents is printed.

    Returns:
        A dict with the keys ``doctype``, ``origin``, ``_id``,
        ``student_email`` and ``content``.
    """
    source_path = os.path.join(portfolios_dir, f)

    # The outline is only printed for diagnostics; it never reaches the dict.
    outline = layout_scanner.get_toc(source_path)
    if verbose > 0:
        print("TOC -- not currently used for dict")
        for entry in outline:
            print(entry)

    page_texts = layout_scanner.get_pages(source_path)
    student_email = email_from_fname(f)

    sections = [
        {'header': "Page %s" % index, 'text': text.decode('utf-8')}
        for index, text in enumerate(page_texts)
        if len(text) > 5  # anything shorter is treated as an empty page
    ]
    if not sections:
        # Placeholder entry used when no page carried enough text.
        sections = [{'text': 'FrozenCutlery', 'txt_ann': None}]

    return {
        'doctype': "report",
        'origin': "shareworks",
        '_id': f,
        'student_email': student_email,
        'content': sections,
    }
def convPdf(portfolios_dir, f, verbose=0):
    """Build the report dictionary for a single PDF portfolio file.

    Args:
        portfolios_dir: Folder in which the PDF lives.
        f: Name of the PDF inside ``portfolios_dir``; reused as ``_id`` and
            passed to ``email_from_fname`` for the ``student_email`` field.
        verbose: Any value above 0 prints the table of contents entries.

    Returns:
        A dict holding ``doctype``, ``origin``, ``_id``, ``student_email``
        and ``content``. ``content`` has one ``header``/``text`` entry per
        page longer than 5, or a single placeholder entry when there is no
        such page.
    """
    path = os.path.join(portfolios_dir, f)

    # Fetched unconditionally, but only ever shown in verbose mode.
    toc_entries = layout_scanner.get_toc(path)
    if verbose > 0:
        print("TOC -- not currently used for dict")
        for toc_entry in toc_entries:
            print(toc_entry)

    raw_pages = layout_scanner.get_pages(path)

    result = dict(doctype="report", origin="shareworks")
    result['_id'] = f
    result['student_email'] = email_from_fname(f)

    kept = []
    for page_no, raw in enumerate(raw_pages):
        if len(raw) <= 5:
            # Too short to count as a page with content.
            continue
        kept.append({
            'header': "Page %s" % page_no,
            'text': raw.decode('utf-8'),
        })

    # Fall back to a placeholder entry when every page was skipped.
    result['content'] = kept or [{'text': 'FrozenCutlery', 'txt_ann': None}]
    return result
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import layout_scanner as ls

name = "m2.pdf"
rsname = "m2rs.txt"
fd = open(rsname, "wb+")

toc = ls.get_toc(name)

for u in toc:
	print "level", u[0]
	print "title", u[1]

tc = ls.get_pages(name, images_folder='img')
for u in tc:
	#fd.write(u.decode('utf-8'))
	print "type:", type(u)
	print "str:",u.decode('utf-8')
	a = u.decode('utf-8')
	print "len u:", len(u)
	print "len a:", len(a)
	#for c in u:
		#print "type c:", type(c)
		#print c.decode('utf-8')
	fd.write(u.decode('utf-8'))
Example #4
0
import layout_scanner

# Read the table of contents of the sample PDF.
toc = layout_scanner.get_toc(
    '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'
)
def getTOCFromLayoutScanner(filePath):
    """Return the table of contents that layout_scanner reads from a file.

    Args:
        filePath: Path of the file, passed unchanged to
            ``layout_scanner.get_toc``.

    Returns:
        Whatever ``layout_scanner.get_toc`` returns for ``filePath``.
    """
    return layout_scanner.get_toc(filePath)
import layout_scanner
# Fetch the table of contents of the PDF and show it.
pdf_path = '/home/dpkm95/pes/4sem/ada/mini-project/Algorithms.pdf'
toc = layout_scanner.get_toc(pdf_path)
print(toc)
Example #7
0
import layout_scanner

# Read the table of contents of the sample PDF.
example_pdf = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'
toc = layout_scanner.get_toc(example_pdf)
Example #8
0
import layout_scanner
from cStringIO import StringIO

# Extract the text of one PDF with layout_scanner and save it to a file.
file_and_name = '/home/hjiang/pmscrapy/pdf_folder/00000001.PDF'
folder = '/home/hjiang/scrapy/'
images_folder = '/home/hjiang/scrapy/tmp/'  # not used below
file_name = '00002207.PDF'  # not used below
output_name = 'test2.txt'
pwd = ''

toc = layout_scanner.get_toc(file_and_name, pwd)

pages = layout_scanner.get_pages(file_and_name)

# Hand every page to write_file in a single call, so the saved text does not
# depend on whether write_file truncates or appends to an existing file.
# NOTE(review): upstream layout_scanner.write_file appears to default to
# flags='w'; calling it once per page would then keep only the last page.
# Confirm against the installed version.
if pages:
    layout_scanner.write_file(folder, output_name, ''.join(pages))