def bkpage(pdf_name):
    """Collect the bookmark (outline) titles of a PDF and their pages.

    Args:
        pdf_name: Path of the PDF file to read.

    Returns:
        A 3-tuple ``(page_title, titles, title_tokens)`` where

        * ``page_title`` maps each UTF-8 encoded bookmark title to the
          value stored under ``'page'`` for that bookmark, converted to int;
        * ``titles`` is the list of those encoded titles ordered by page
          and then by title;
        * ``title_tokens`` holds, for each entry of ``titles``, the title
          split on whitespace.

    Note:
        Titles are used as dictionary keys, so two bookmarks with the same
        title collapse into a single entry (the last one processed wins).
    """
    page_title = {}
    # The context manager guarantees the file handle is released even if
    # parsing fails; everything that touches PDF-derived objects stays
    # inside the block so nothing is read after the file is closed.
    with open(pdf_name, 'rb') as f:
        pdf = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(pdf)
        outlines = pdf.getOutlines()
        bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
        for key in bookmarks_info:
            info = bookmarks_info[key]
            page_title[info['title'].encode('utf-8')] = int(info['page'])

    # Order by page first, title second. An index-based key is used instead
    # of a tuple-parameter lambda, which is a syntax error on Python 3.
    ordered = sorted(page_title.items(), key=lambda item: (item[1], item[0]))
    titles = [title for title, _page in ordered]
    title_tokens = [title.split() for title in titles]
    return (page_title, titles, title_tokens)
def get_outlines(pdf_file_path):
    """Return the outlines (bookmarks) of the PDF at ``pdf_file_path``.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The result of ``PdfFileReader.getOutlines()`` for that file.

    Raises:
        ValueError: If ``pdf_file_path`` is empty or ``None``.
    """
    if not pdf_file_path:
        # A bare ``raise`` with no active exception only produces an
        # unrelated interpreter error; report the real problem instead.
        raise ValueError("pdf_file_path must be a non-empty path")
    # PDF files are binary: text mode would translate line endings on
    # Windows and hand the reader a text stream on Python 3.
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfFileReader(f)
        return pdf_reader.getOutlines()
intr = open(subheading[i],'w') for k in xrange(len(part)-1): intr.write(part[k]) c=0 part=[] elif i+1==len(subheading): intr = open(subheading[i],'w') for k in xrange(len(part)-1): intr.write(part[k]) path = "C:\Users\Wu/nordron-sciinfo\Code\Rainy\code\extraction" #please change path article = textract.process('y.pdf', m='pdfminer') #please chang the file name f = open('y.pdf', 'rb') #import PDF file p = PdfFileReader(f) o = p.getOutlines() #read outlines in pdf list = [] dimension(o, list) pdftotxt(article) #build a list of subtitle subheading = [] for j in range(0,len(list)): sub = list[j]["/Title"] subheading.append(sub) #title=title_extractor(path, filename) myfile = open("pdf_1.txt") line = myfile.readlines() # read txt file line by line split(line,subheading,path)