def convPdf(portfolios_dir, f, verbose=0):
    """Convert one PDF into a document dict.

    Args:
        portfolios_dir: Directory that contains the PDF.
        f: File name of the PDF inside ``portfolios_dir``. It is also stored
            as the document ``_id`` and passed to ``email_from_fname``.
        verbose: When greater than 0, the PDF's table of contents is printed.

    Returns:
        A dict with the keys ``doctype``, ``origin``, ``_id``,
        ``student_email`` and ``content``. ``content`` holds one
        ``{'header', 'text'}`` entry for every page whose length is greater
        than 5, or a single placeholder entry when no page qualifies.
    """
    pdf_filepath = os.path.join(portfolios_dir, f)

    # The TOC is only printed for debugging; it is not stored in the result.
    # It is still fetched unconditionally so errors surface as before.
    toc = layout_scanner.get_toc(pdf_filepath)
    if verbose > 0:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("TOC -- not currently used for dict")
        for e in toc:
            print(e)

    pages = layout_scanner.get_pages(pdf_filepath)

    doc_dict = {'doctype': "report", 'origin': "shareworks"}
    doc_dict['_id'] = f
    doc_dict['student_email'] = email_from_fname(f)
    doc_dict['content'] = []

    for count, p in enumerate(pages):
        # Heuristic for "non-empty": pages of length 5 or less are skipped.
        if len(p) <= 5:
            continue
        # Only decode byte strings; text that is already decoded is kept
        # as-is instead of failing on a second decode.
        text = p.decode('utf-8') if isinstance(p, bytes) else p
        # Headers use the zero-based page index.
        doc_dict['content'].append({'header': "Page %s" % count, 'text': text})

    if not doc_dict['content']:
        # Placeholder entry so the document never has empty content.
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]

    return doc_dict
def convPdf(portfolios_dir, f, verbose=0):
    """Build the document dict for a single PDF.

    Args:
        portfolios_dir: Directory that contains the PDF.
        f: File name of the PDF inside ``portfolios_dir``; reused as the
            document ``_id`` and handed to ``email_from_fname``.
        verbose: When greater than 0, the table of contents is printed.

    Returns:
        A dict with the keys ``doctype``, ``origin``, ``_id``,
        ``student_email`` and ``content``. ``content`` is a list with one
        ``{'header', 'text'}`` entry per page whose length exceeds 5; when no
        page qualifies it contains a single placeholder entry instead.
    """
    pdf_filepath = os.path.join(portfolios_dir, f)

    # The table of contents is printed for inspection only; it does not
    # end up in the returned dict.
    toc = layout_scanner.get_toc(pdf_filepath)
    if verbose > 0:
        # print(...) with one argument gives identical output on Python 2 and 3.
        print("TOC -- not currently used for dict")
        for entry in toc:
            print(entry)

    pages = layout_scanner.get_pages(pdf_filepath)
    student_email = email_from_fname(f)

    content = [
        {
            # Header carries the zero-based page index.
            'header': "Page %s" % count,
            # Decode byte strings only; already-decoded text passes through.
            'text': p.decode('utf-8') if isinstance(p, bytes) else p,
        }
        for count, p in enumerate(pages)
        if len(p) > 5  # heuristic: anything this short counts as an empty page
    ]
    if not content:
        # Placeholder so the document never has empty content.
        content = [{'text': 'FrozenCutlery', 'txt_ann': None}]

    return {
        'doctype': "report",
        'origin': "shareworks",
        '_id': f,
        'student_email': student_email,
        'content': content,
    }
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the table of contents and page text of a PDF and save the text.

Reads ``m2.pdf``, prints every TOC entry and some diagnostics per page, and
writes the text of all pages, UTF-8 encoded, to ``m2rs.txt``.
"""
import layout_scanner as ls

name = "m2.pdf"
rsname = "m2rs.txt"

toc = ls.get_toc(name)
for u in toc:
    # %-formatting keeps the "label value" output identical on Python 2 and 3.
    print("level %s" % (u[0],))
    print("title %s" % (u[1],))

tc = ls.get_pages(name, images_folder='img')

# The context manager guarantees the output file is flushed and closed.
with open(rsname, "wb") as fd:
    for u in tc:
        # Decode byte strings once; text that is already decoded is kept as-is.
        a = u.decode('utf-8') if isinstance(u, bytes) else u
        print("type: %s" % (type(u),))
        print("str: %s" % (a,))
        print("len u: %s" % (len(u),))
        print("len a: %s" % (len(a),))
        # The file is opened in binary mode, so write explicit UTF-8 bytes.
        # Writing the decoded text directly relies on an implicit ASCII
        # encode, which fails on non-ASCII characters.
        fd.write(a.encode('utf-8'))
import layout_scanner

# PDF whose table of contents is read below.
_EXAMPLE_PDF = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'

# Keep the result in a module-level name so it can be inspected afterwards.
toc = layout_scanner.get_toc(_EXAMPLE_PDF)
def getTOCFromLayoutScanner(filePath):
    """Return the result of ``layout_scanner.get_toc`` for the given file.

    Args:
        filePath: Path of the file handed to ``layout_scanner.get_toc``.

    Returns:
        Whatever ``layout_scanner.get_toc(filePath)`` returns, unchanged.
    """
    return layout_scanner.get_toc(filePath)
import layout_scanner

# PDF whose table of contents is extracted and printed.
_ALGORITHMS_PDF = '/home/dpkm95/pes/4sem/ada/mini-project/Algorithms.pdf'

toc = layout_scanner.get_toc(_ALGORITHMS_PDF)
# Parenthesised single-argument print produces the same output as `print toc`.
print(toc)
import layout_scanner

# Read the table of contents of the example PDF. The path literal is split
# across two lines for readability only; adjacent string literals are
# concatenated, so the value is unchanged.
toc = layout_scanner.get_toc(
    '/Users/mattstringer/Dropbox/ProyectoLaCumbre/'
    'DataClean/pdfs/example_finca.pdf'
)
import layout_scanner
from cStringIO import StringIO

# Input PDF and output locations for this one-off conversion script.
file_and_name = '/home/hjiang/pmscrapy/pdf_folder/00000001.PDF'
folder = '/home/hjiang/scrapy/'
# NOTE(review): images_folder and file_name are not used below; the path
# typo ('/homne/') is corrected so the value is right if it gets wired in.
images_folder = '/home/hjiang/scrapy/tmp/'
file_name = '00002207.PDF'
output_name = 'test2.txt'
pwd = ''

toc = layout_scanner.get_toc(file_and_name, pwd)

pages = layout_scanner.get_pages(file_and_name)
if pages:
    # Write all pages in a single call. Calling write_file once per page
    # against the same output file leaves only the last page if write_file
    # truncates on open (NOTE(review): upstream default appears to be
    # flags='w' -- confirm). A single joined write gives the full text
    # whether write_file truncates or appends.
    layout_scanner.write_file(folder, output_name, ''.join(pages))