# Extract the tables of every downloaded Kobe "shinseido" PDF into one CSV
# file per page.  The page range of each PDF is looked up in
# shinseido_meta/index.csv (columns used here: "file", "start", "end").
import csv
import glob
import os
import os.path

import pdftableextract as pte

for f in glob.glob("www.city.kobe.lg.jp/child/grow/shinseido/img/*.pdf"):
    fn = os.path.basename(f)
    out = "shinseido/%s.ttl" % fn
    # Skip PDFs whose .ttl output is at least as new as the PDF itself.
    # NOTE(review): this script never writes the .ttl file; presumably a
    # later step produces it -- confirm.
    if os.path.exists(out) and os.stat(out).st_mtime >= os.stat(f).st_mtime:
        continue

    # Close the index file deterministically instead of leaking one handle
    # per processed PDF.
    with open("shinseido_meta/index.csv", encoding="UTF-8", newline="") as index_file:
        info = [r for r in csv.DictReader(index_file) if r["file"] == fn]
    assert info, fn

    pages = [int(info[0][k]) for k in ("start", "end")]
    assert pages, fn
    # NOTE(review): range() excludes the "end" value, so "end" is treated as
    # one past the last page -- confirm against index.csv.
    for page in range(*pages):
        a = pte.process_page(f, str(page))
        x = pte.table_to_list(a, page)
        # newline="" lets the csv module control line endings itself
        # (otherwise rows get an extra blank line on Windows).
        with open("shinseido/%s.p%02d.csv" % (fn, page), "w", newline="") as o:
            w = csv.writer(o)
            # Only the last table returned for the page is written.
            w.writerows(x[-1])
# Debugging aid (disabled): dump every extracted cell tuple.
# print("cells:")
# for cell in cells:  # cells: list of tuples
#     tempStr = ""
#     for item in cell:  # cell: tuple
#         tempStr += str(item) + ", "
#     print(tempStr)

# Open question: do multiple tables on the same page need special handling
# here?  Seemingly not -- there is no obvious difference between two different
# tables.  (Detecting a table boundary by a row that spans all columns is not
# reliable either, because some tables contain a full-width data row in the
# middle.)
# -----------------------------------------------------------------------------
# Without any options, process_page picks up a blank table at the top of the
# page, so take the last table that table_to_list returns.
li = pdf.table_to_list(cells, pages)[-1]

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("li: ")
# li is a list of lists: one inner list of cell strings per table row.
for line in li:
    print(", ".join(line))

# Shape of li:
# [
#     ["content of cell 0 in row 0", "content of cell 1 in row 0", ..., "content of cell n in row 0"],
#     ["content of cell 0 in row 1", "content of cell 1 in row 1", ..., "content of cell n in row 1"],
#     ...
#     ["content of cell 0 in row m", "content of cell 1 in row m", ..., "content of cell n in row m"],
# ]
# li is a list of lists, the first line is the header, last is the footer
# (for this table only!)
# column '0' contains store names
# Column layout shared by the parsed result and the "ERR" placeholder frame.
_BUDGET_COLUMNS = [
    "Region", "District", "Sector/MDA/MMDA",
    "Central GOG and CF: Comp of Emp", "Central GOG and CF: Goods/Service",
    "Central GOG and CF: Assets (Capital)", "Central GOG and CF: Total",
    "IGF: Comp of Emp", "IGF: Goods/Service", "IGF: Assets (Capital)",
    "IGF: Total",
    "Funds/Others: Comp of Emp", "Funds/Others: Goods/Service",
    "Funds/Others: Assets (Capital)", "Funds/Others: Total",
    "Donor: Comp of Emp", "Donor: Goods/Service", "Donor: Assets (Capital)",
    "Donor: Total",
    "Grand Total Less NREG / Statutory",
]


def _error_frame(region_name, district_name):
    """Return the one-row placeholder frame used when a PDF cannot be parsed."""
    row = [region_name, district_name] + ["ERR"] * (len(_BUDGET_COLUMNS) - 2)
    return pd.DataFrame([row], columns=_BUDGET_COLUMNS)


def _parse_budget_line(line, region_name, district_name):
    """Split one table cell into ``[region, district, label, numbers...]`` rows.

    Runs of two or more non-digit characters are treated as labels.  A line
    with several labels yields one row per label, the text between two labels
    being split on single spaces.  A line with one or no label yields at most
    one row whose numbers are the ``[0-9.,]+`` runs found after the label.
    Rows that end up with a label but no number fields are dropped.
    """
    labels = re.findall("[^0-9]{2,}", line)
    parsed = []
    if len(labels) > 1:
        for i, label in enumerate(labels):
            # NOTE(review): index() finds the first occurrence, so a label
            # that repeats within one line resolves to its first position.
            start = line.index(label) + len(label)
            end = line.index(labels[i + 1]) if i + 1 < len(labels) else len(line)
            parsed.append([label.strip()] + line[start:end].split(" "))
    elif len(labels) == 1:
        numbers = re.findall("[0-9.,]+", line[len(labels[0]):])
        parsed.append([labels[0].strip()] + numbers)
    else:
        parsed.append([""] + re.findall("[0-9.,]+", line))
    return [[region_name, district_name] + row for row in parsed if len(row) > 1]


def search(params):
    """Extract the budget table found on the pages of a PDF that mention a term.

    Args:
        params: a two-item sequence ``(inputFile, search)``: the path of the
            PDF and the text to look for.  It is a single argument, as it was
            with the original Python 2 tuple parameter, so existing positional
            callers keep working.

    Returns:
        A pandas DataFrame with the 20 budget columns.  The region is the
        name of the PDF's parent directory and the district is the file name
        without extension.  If fewer than two tables are found, or anything
        fails while reading or parsing, a single row filled with "ERR" is
        returned instead.

    Raises:
        IndexError: if ``inputFile`` contains no "/" separator.
        IOError / OSError: if the file cannot be opened.
    """
    inputFile, needle = params
    baseName = os.path.basename(inputFile)
    inputName, inputExtension = os.path.splitext(baseName)
    print(inputName)
    regionName = inputFile.split("/")[-2]

    fr = open(inputFile, 'rb')
    try:
        # `with` closes the handle even when a page fails to parse.
        with fr:
            pdf = pyPdf.PdfFileReader(fr)
            hits = []
            for page in range(pdf.getNumPages()):
                spin()  # presumably advances a progress spinner -- defined elsewhere
                text = pdf.getPage(page).extractText()
                # Only the first two matching pages are kept (1-based numbers).
                if text.find(needle) > -1 and len(hits) < 2:
                    hits.append(str(page + 1))

        cells = [pdfextract.process_page(inputFile, p) for p in hits]
        cells = [item for sublist in cells for item in sublist]
        li = pdfextract.table_to_list(cells, hits)
        if len(li) <= 1:
            return _error_frame(regionName, inputName)

        data = []
        # First and last row of each table are skipped.
        for row in li[-2][1:-1]:
            spin()
            data.extend(_parse_budget_line(row[0], regionName, inputName))
        for row in li[-1][1:-1]:
            spin()
            # The second table also carries "Page ..." lines, which are not data.
            if row[0].find("Page") == -1:
                data.extend(_parse_budget_line(row[0], regionName, inputName))
        return pd.DataFrame(data, columns=_BUDGET_COLUMNS)
    except Exception:
        # Deliberate best effort: any parsing failure becomes an "ERR" row.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return _error_frame(regionName, inputName)
import pandas as pd
import pdftableextract as pdf

pages = ["1"]

# Run the extractor on every requested page and merge the per-page cell
# lists into one flat list.
cells = []
for p in pages:
    cells.extend(pdf.process_page("example.pdf", p))

# Without any options, process_page picks up a blank table at the top of the
# page, so table '1' is the one that is wanted.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of rows (each a list of cell strings); for this table only,
# the first line is the header and the last is the footer.
# Row '1' holds the column headings, rows '2' through '-1' hold the data, and
# column '0' of each data row (the store name) becomes the index.
column_names = li[1]
body_rows = li[2:-1]
data = pd.DataFrame(
    body_rows,
    columns=column_names,
    index=[row[0] for row in body_rows],
)
def transform(filename, console=None):
    """Extract all table rows of a PDF and normalise them into one flat list.

    Every page is passed through ``pdftable.process_page`` and the resulting
    tables are cleaned up row by row:

    * rows whose cells are all empty, or whose first cell satisfies
      ``is_header_text``, are dropped;
    * every remaining cell is decoded from UTF-8 and stripped;
    * a row whose column 3 holds a numbered list ("1. medicament one
      2. medicament two") is expanded into one row per item.  Column 4 is
      split the same way, or repeated for every item when it has no numbering.

    Progress messages are emitted through ``write(console, ...)``.

    NOTE(review): relies on the Python 2 ``unicode`` builtin.  Columns 3 and 4
    of expanded rows are re-encoded to UTF-8 byte strings, while all other
    cells stay unicode.

    Args:
        filename: path of the PDF to read.
        console: passed through unchanged to ``write``.

    Returns:
        list of rows (each a list of cells) from all tables, in order.
    """
    write(console, " - Parsing...", ending='')
    # Only the page count is needed from the reader, so the handle is closed
    # as soon as it is known instead of leaking for the rest of the run.
    with open(filename, 'rb') as pdf_file:
        page_count = len(pdf.PdfFileReader(pdf_file).pages)
    pages = [str(p) for p in range(1, page_count + 1)]

    # Process the cells of every page and flatten them into one list.
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []
    write(console, "done")
    row_msg_format = "\r\033[K - %d/%d records transformed"
    for page_table in pdftable.table_to_list(cells, pages):
        # page_table is edited in place (rows deleted/inserted), so the row
        # count is tracked by hand alongside the cursor.
        page_table_rows = len(page_table)
        idx = 0
        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')
            cell_len = sum(len(cell) for cell in page_table[idx])
            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                row = page_table[idx]
                for subidx, cell in enumerate(row):
                    row[subidx] = unicode(cell, encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases = r'\W?\d+\.\W\W'
                    # [1:] discards whatever precedes the first "1. " marker.
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)
                    if not splitted_vendors:
                        # Vendor cell is not numbered: same vendor for all.
                        splitted_vendors = [page_table[idx][4]] * added_products
                    # NOTE(review): a numbered vendor cell with fewer items
                    # than the product cell raises IndexError below.
                    for i in range(added_products):
                        if i > 0:
                            # Each extra product gets its own copy of the row,
                            # inserted right after the current one.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1
                            page_table.insert(idx, row_copy)
                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')
                idx += 1
        table += page_table
        write(console, row_msg_format % (idx, page_table_rows), ending='')
    write(console, '')
    return table
def process(start, end):
    '''Parse result PDFs for a range of roll numbers into the global ``result``.

    For every roll number in ``start..end`` (inclusive) the file
    ``result<N>.pdf`` in the current directory is read and its second table
    is walked row by row.  The header row (the one containing 'Branch')
    supplies college, branch, register number and exam name.  Every other
    non-boilerplate row is stored as
    ``result[college][branch][subject][register] = [external, res]``.

    Roll numbers whose PDF is missing, invalid or cannot be parsed are
    skipped and listed on stdout at the end.  Rows stored before the failure
    stay in ``result``.  Finally ``result`` is dumped to ``output.json``.

    Args:
        start: first roll number (inclusive).
        end: last roll number (inclusive).
    '''
    global result, exam
    badresult = []
    # A single-roll range has zero span.  Dividing by it used to raise inside
    # the try block, which silently marked that roll number as unavailable.
    span = end - start
    for count in range(start, end + 1):
        # Reset header state per file so rows of a PDF without a header row
        # cannot be filed under the previous student.  They now fail the
        # dictionary lookup and the file is reported as bad.
        college = branch = register = None
        try:
            if verbosity == 1:
                print("Roll Number # %s" % count)
            else:
                percent = float(count - start) * 100 / span if span else 100.0
                sys.stdout.write("\r%.2f%%" % percent)
                sys.stdout.flush()
            pages = ["1"]
            filename = "result" + str(count) + ".pdf"
            # Checking if valid pdf file; `with` closes the handle even when
            # the check raises.
            with open(filename, "rb") as f:
                PdfFileReader(f)
            cells = [pdf.process_page(filename, p) for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header cell: "College : .. Branch : .. Name : ..
                    # Register No : .. Exam Name : .."; each field runs up to
                    # the start of the next label.
                    header = i[0]
                    collegepos = header.index('College : ')
                    branchpos = header.index('Branch : ')
                    namepos = header.index('Name : ')
                    registerpos = header.index('Register No : ')
                    exampos = header.index('Exam Name : ')
                    college = header[collegepos:branchpos][9:].strip().title()
                    branch = header[branchpos:namepos][9:].strip().title()
                    exam = header[exampos:][11:].strip().title()
                    register = header[registerpos:exampos][13:].strip()
                    result.setdefault(college, {}).setdefault(branch, {})
                elif 'Mahatma' in i[0]:
                    pass
                elif 'Sl. No' in i[0]:
                    pass
                elif 'Semester Result' in i[1]:
                    pass
                else:
                    subject = i[1]
                    # Internal marks are not stored, but a non-numeric value
                    # still rejects the whole PDF, as before.
                    if i[2] != '-':
                        int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    by_subject = result[college][branch]
                    by_subject.setdefault(subject, {})[register] = \
                        [external, res]
        except Exception:
            # Best effort: a missing or unparseable PDF is reported, not fatal.
            badresult.append(count)
            continue
    if badresult:
        print("\nUnavailable Results Skipped")
        for invalid in badresult:
            print("Roll Number # %s" % invalid)
    with open('output.json', 'w') as outfile:
        outfile.write(json.dumps(result))
    print("")
def process(self, start, end, parentfolder):
    '''Parse result PDFs for a range of roll numbers and write JSON summaries.

    For every roll number in ``start..end`` (inclusive) the file
    ``<parentfolder>/Results/result<N>.pdf`` is read and its second table is
    walked row by row.  The header row (the one containing 'Branch') supplies
    college, branch, student name and register number.  Every other
    non-boilerplate row is recorded in:

    * ``self.result_register[college][branch][register]``: ``"name"`` plus
      ``subject -> [internal, external, internal + external, res]``
    * ``self.result_subject[college][branch][subject][register]``:
      ``[external, res]``
    * ``self.registers[college][branch]`` / ``self.subjects[branch]``: lists
      of the register numbers / subjects seen.

    ``self.result_register`` and ``self.result_subject`` are not reset here
    and must already exist.  ``self.registers``, ``self.subjects`` and
    ``self.badresult`` are reset on every call.  After each successfully
    parsed file ``self.parent.progressbar2`` is advanced.  Finally both
    result dicts are written to ``output_register.json`` and
    ``output_subject.json`` in ``parentfolder``.

    Args:
        start: first roll number (inclusive).
        end: last roll number (inclusive).
        parentfolder: folder holding the ``Results`` directory; the JSON
            files are written here.

    Returns:
        list of roll numbers whose PDF was missing or could not be parsed.
    '''
    self.badresult = []
    self.registers = {}
    self.subjects = {}
    result_pdf_path = os.path.join(parentfolder, 'Results')

    # One progress step per file of the inclusive range.  The old
    # 100 / (end - start) raised ZeroDivisionError for a single roll number
    # after its data was stored, flagging it as bad.  For longer ranges the
    # steps summed to more than 100.
    total = end - start + 1
    unit = 100.0 / total if total > 0 else 0.0

    for count in range(start, end + 1):
        # Reset header state per file so rows of a PDF without a header row
        # cannot be filed under the previous student.  They now fail the
        # dictionary lookup and the file is reported as bad.
        college = branch = register = name = None
        try:
            pages = ["1"]
            filename = "result" + str(count) + ".pdf"
            filepath = os.path.join(result_pdf_path, filename)
            # Checking if valid pdf file; `with` closes the handle even when
            # the check raises.
            with open(filepath, "rb") as f:
                PdfFileReader(f)
            cells = [pdf.process_page(filepath, p) for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header cell: "College : .. Branch : .. Name : ..
                    # Register No : .. Exam Name : .."; each field runs up to
                    # the start of the next label.
                    header = i[0]
                    collegepos = header.index('College : ')
                    branchpos = header.index('Branch : ')
                    namepos = header.index('Name : ')
                    registerpos = header.index('Register No : ')
                    exampos = header.index('Exam Name : ')
                    college = header[collegepos:branchpos][9:].strip().title()
                    branch = header[branchpos:namepos][9:].strip().title()
                    register = header[registerpos:exampos][13:].strip()
                    name = header[namepos:registerpos][7:].strip()
                    self.result_subject.setdefault(
                        college, {}).setdefault(branch, {})
                    self.result_register.setdefault(
                        college, {}).setdefault(branch, {})
                    self.registers.setdefault(
                        college, {}).setdefault(branch, [])
                    self.subjects.setdefault(branch, [])
                elif 'Mahatma' in i[0]:
                    pass
                elif 'Sl. No' in i[0]:
                    pass
                elif 'Semester Result' in i[1]:
                    pass
                else:
                    subject = i[1]
                    # '-' stands for "no marks" and counts as 0.
                    internal = 0 if i[2] == '-' else int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    if register not in self.registers[college][branch]:
                        self.registers[college][branch].append(register)
                    if subject not in self.subjects[branch]:
                        self.subjects[branch].append(subject)
                    student = self.result_register[college][
                        branch].setdefault(register, {"name": name})
                    student[subject] = \
                        [internal, external, internal + external, res]
                    by_subject = self.result_subject[college][branch]
                    by_subject.setdefault(subject, {})[register] = \
                        [external, res]
            current = self.parent.progressbar2.value()
            if current == -1:
                current = 0
            self.parent.progressbar2.setValue(current + unit)
        except Exception:
            # Best effort: a missing or unparseable PDF is reported, not fatal.
            self.badresult.append(count)
            continue
    self.parent.progressbar2.setValue(100)

    json1path = os.path.join(parentfolder, 'output_register.json')
    with open(json1path, 'w') as outfile:
        outfile.write(json.dumps(self.result_register, indent=4))
    json2path = os.path.join(parentfolder, 'output_subject.json')
    with open(json2path, 'w') as outfile2:
        outfile2.write(json.dumps(self.result_subject, indent=4))
    return self.badresult
def transform(filename, console=None):
    """Extract all table rows of a PDF and normalise them into one flat list.

    Every page is passed through ``pdftable.process_page`` and the resulting
    tables are cleaned up row by row:

    * rows whose cells are all empty, or whose first cell satisfies
      ``is_header_text``, are dropped;
    * every remaining cell is decoded from UTF-8 and stripped;
    * a row whose column 3 holds a numbered list ("1. medicament one
      2. medicament two") is expanded into one row per item.  Column 4 is
      split the same way, or repeated for every item when it has no numbering.

    Progress messages are emitted through ``write(console, ...)``.

    NOTE(review): relies on the Python 2 ``unicode`` builtin.  Columns 3 and 4
    of expanded rows are re-encoded to UTF-8 byte strings, while all other
    cells stay unicode.

    Args:
        filename: path of the PDF to read.
        console: passed through unchanged to ``write``.

    Returns:
        list of rows (each a list of cells) from all tables, in order.
    """
    write(console, " - Parsing...", ending='')
    # Only the page count is needed from the reader, so the handle is closed
    # as soon as it is known instead of leaking for the rest of the run.
    with open(filename, 'rb') as pdf_file:
        page_count = len(pdf.PdfFileReader(pdf_file).pages)
    pages = [str(p) for p in range(1, page_count + 1)]

    # Process the cells of every page and flatten them into one list.
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []
    write(console, "done")
    row_msg_format = "\r\033[K - %d/%d records transformed"
    for page_table in pdftable.table_to_list(cells, pages):
        # page_table is edited in place (rows deleted/inserted), so the row
        # count is tracked by hand alongside the cursor.
        page_table_rows = len(page_table)
        idx = 0
        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')
            cell_len = sum(len(cell) for cell in page_table[idx])
            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                row = page_table[idx]
                for subidx, cell in enumerate(row):
                    row[subidx] = unicode(cell, encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases = r'\W?\d+\.\W\W'
                    # [1:] discards whatever precedes the first "1. " marker.
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)
                    if not splitted_vendors:
                        # Vendor cell is not numbered: same vendor for all.
                        splitted_vendors = [page_table[idx][4]] * added_products
                    # NOTE(review): a numbered vendor cell with fewer items
                    # than the product cell raises IndexError below.
                    for i in range(added_products):
                        if i > 0:
                            # Each extra product gets its own copy of the row,
                            # inserted right after the current one.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1
                            page_table.insert(idx, row_copy)
                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')
                idx += 1
        table += page_table
        write(console, row_msg_format % (idx, page_table_rows), ending='')
    write(console, '')
    return table
def proc(input, output):
    """Write the second table of page 1 of a PDF to a CSV file.

    Nothing is done when ``output`` already exists and is strictly newer than
    ``input``.

    Args:
        input: path of the source PDF.
        output: path of the CSV file to (re)write.

    Returns:
        None.
    """
    if os.path.exists(output) and os.stat(output).st_mtime > os.stat(input).st_mtime:
        return
    # Extract first, so a failing extraction does not truncate an existing
    # output file.
    rs = pte.table_to_list(pte.process_page(input, "1"), 1)
    # `with` closes (and therefore flushes) the CSV; the original left the
    # handle open for the garbage collector.
    with open(output, "w") as fh:
        csv.writer(fh).writerows(rs[1])