def parsePdfMenu(path, debug=False):
    """Parse the menu table of a PDF into a ``menu.Week``.

    Args:
        path: Location of the PDF; passed unchanged to ``getTable``.
        debug: When true, print the extracted table and its accuracy and
            show camelot's contour and grid plots before parsing.

    Returns:
        The populated ``menu.Week``, or ``None`` when ``getTable`` returned
        ``None`` or a data row does not contain exactly six cells.
    """
    table = getTable(path)
    if table is None:
        return None

    if debug:
        print(table)
        print(table.accuracy)
        camelot.plot(table, kind='contour')
        camelot.plot(table, kind='grid')
        plt.show()

    week = menu.Week()
    # The first row carries the price information; the rest are day rows.
    price_info = parsePriceInfo(table.data[0:1])
    for row in table.data[1:]:
        if len(row) != 6:
            # The original concatenated an int with a str here, which raised
            # TypeError instead of printing the diagnostic.
            print(f"{len(row)} rows instead of six, aborting")
            return None
        cleanRow = cleanParsedRow(row)
        day = parseMenuForDay(cleanRow)
        day.addPrices(price_info)
        week.addNextDay(day)
    return week
def get_pdf_information():
    """Extract the page-3 table of ``UFF.pdf`` and save rows 7..70 as CSV.

    The first table found with the stream flavor is plotted in ``text``
    mode, the type of its DataFrame is printed, and the slice ``[7:71]``
    is written to ``LJ_data.csv``.
    """
    extracted = camelot.read_pdf(
        'UFF.pdf',
        pages='3',
        flavor='stream',
        table_area=['68,710, 545,140'],
        # strip_text=' \n' is left disabled here.
    )
    first_table = extracted[0]

    camelot.plot(first_table, kind='text')
    # plt.show()  # enable when the plot needs to be inspected

    table_df = first_table.df
    print(type(table_df))
    table_df[7:71].to_csv('LJ_data.csv')
def exportPDFTableDataToCSV():
    """Read ``foo.pdf`` from the working directory and plot each table.

    The table list is printed, then every table is announced by index and
    drawn with camelot's ``contour`` plot. Nothing is written to disk; the
    CSV export is left commented out.
    """
    pdf_file = os.path.join(os.getcwd(), 'foo.pdf')
    tables = camelot.read_pdf(pdf_file)
    print(tables)

    # To export all tables present in the pdf into a csv file:
    # tables.export('foo.csv', f='csv', compress=True)

    for table_num, table in enumerate(tables):
        print("On Table Num: {}".format(table_num))
        camelot.plot(table, kind='contour')
def __init__(self, parent=None, width=8, height=8, dpi=100): fig = Figure(figsize=(width, height), dpi=200) # 创建一个Figure,注意:该Figure为matplotlib下的figure,不是matplotlib.pyplot下面的figure pdf = camelot.read_pdf(r"C:\Users\localhost\Desktop\石家庄市2018年市本级和全市财政总决算报表.pdf", flavor='stream', pages='5') if pdf: fig = camelot.plot(pdf[0], kind='textedge') fig.set_dpi(150) axis('tight') FigureCanvas.__init__(self, fig) # 初始化父类 self.setParent(parent)
def camelot_pdf_parsing():
    """Parse two sample PDFs with camelot, dumping and plotting each table.

    For every configured PDF the table list is printed; each table is then
    written to ``<pdf name>_<index>.csv``, its DataFrame is printed, and
    its ``text`` and ``grid`` plots are shown.
    """
    pdf_configs = [
        {"path": "data/background_lines.pdf", "kwargs": {"process_background": True}},
        {"path": "data/PhDThesis.pdf", "kwargs": {}},
    ]

    for config in pdf_configs:
        pdf_path = config["path"]
        read_kwargs = config["kwargs"]

        print("\n#=== Parsing {} =# \n".format(pdf_path))
        tables = camelot.read_pdf(pdf_path, **read_kwargs)  # , pages="1"
        print(tables)

        for ii, table in enumerate(tables):
            csv_path = pdf_path.replace(".pdf", f"_{ii}.csv")
            table.to_csv(csv_path)
            print(table.df)
            for plot_kind in ('text', 'grid'):
                camelot.plot(table, kind=plot_kind)
            plt.show()
def extract_table(table):
    """Extract one table region from its PDF and save it as CSV and HTML.

    The lattice parser is tried first; if it finds nothing, the stream
    parser is tried on the same area and page. The first table found is
    written to ``<uuid>.csv``, re-read with pandas, rendered to
    ``<uuid>.html`` and shown as a contour plot titled with the uuid.
    Any exception is caught and reported on stdout.

    Args:
        table: Record providing ``fileId``, ``page``, ``uuid`` and the
            ``pdfX1``/``pdfY1``/``pdfX2``/``pdfY2`` coordinates.
    """
    try:
        pdf_path = str(pdf_files_folder.joinpath(f"{table.fileId}.pdf"))
        # Arguments common to the lattice and the stream attempt.
        common = dict(
            table_areas=[
                f"{table.pdfX1},{table.pdfY1},{table.pdfX2},{table.pdfY2}"
            ],
            pages=str(table.page),
            strip_text='\n',
            flag_size=True,
        )

        found = camelot.read_pdf(pdf_path, line_scale=40, **common)
        print(f"found {len(found)} tables with lattice")

        if len(found) == 0:
            found = camelot.read_pdf(pdf_path, flavor="stream", **common)
            print(f"found {len(found)} tables with stream")

        if not len(found) > 0:
            print(f">>>> No tables found for table ID {table.uuid}")
            return

        first = found[0]
        csv_file_name = csv_tables_folder_path.joinpath(f"{table.uuid}.csv")
        first.to_csv(csv_file_name, index=False, header=False)

        # Round-trip through pandas so the HTML is built from the CSV text.
        frame = pd.read_csv(
            csv_file_name,
            na_filter=False,
            skip_blank_lines=False,
            header=None,
        )
        html_file_name = html_tables_folder_path.joinpath(f"{table.uuid}.html")
        frame.to_html(
            html_file_name,
            index=False,
            header=False,
            encoding="utf-8-sig",
            na_rep=" ",
        )

        figure = camelot.plot(first, kind='contour')
        figure.suptitle(table.uuid)
        plt.show()
    except Exception as e:
        print(f"==== Error extracting table ID {table.uuid} ======")
        print(e)
        print(f"======================================")
def on_check_btn_clicked(self):
    """Show camelot's ``textedge`` plot for the page and PDF the user entered.

    An information box is shown and the handler returns early when either
    field is empty or the path does not end in ``.pdf``.
    """
    # self.find_code_by_name()
    page_text = self.currentPageEdit.text().strip()
    if page_text == "" or self.pathEdit.text().strip() == "":
        QMessageBox.information(self, "提示", ' 输入不能为空! ')
        return

    path_text = self.pathEdit.text().strip()
    if not path_text.endswith(".pdf"):
        QMessageBox.information(self, "提示", ' 只有PDF文件需要次操作! ')
        return

    pdf_path = path_text.replace('.docx', '.pdf').replace(".doc", '.pdf')
    print(pdf_path)

    pdf = camelot.read_pdf(pdf_path, flavor='stream', pages=page_text)
    if pdf:
        figure = camelot.plot(pdf[0], kind='textedge')
        figure.show()
        axis('tight')
        current = pylab.gcf()
        # The page number is read again here, as the original does.
        window_title = ("第" + self.currentPageEdit.text().strip() +
                        "页表格解析示意图")
        current.canvas.set_window_title(window_title)
def test_textedge_plot():
    """Return the ``textedge`` plot of the first stream table in 12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="stream")[0]
    return camelot.plot(first_table, kind="textedge")
import os

path = os.getcwd()

# Author's note: read the PDF, selecting the pages of interest, and give the
# area of the sheet through table_areas as
# ['x_leftup, yleftup, xrightdown, yrightdown'] (the coordinates are to be
# given in pixels; lines 16 and 17 help with that). The author was unsure
# what the flavor attribute does but observed that the CSVs come out better.
df = camelot.read_pdf(
    path + "/ENAC_Traffic_data_2017_en.pdf",
    pages='45,46,47',
    flavor='stream',
    table_areas=['30,690,552,16'],
)

# This makes it possible to see the areas the package manages to detect.
camelot.plot(df[0], kind='text')
show()

df.export(path + '/Risultati/boh.csv', f='csv')

path = "D:/OneDrive/OneDrive - Universita' degli Studi di Roma Tor Vergata/Python/Scraping/Risultati/"

# Author's note: camelot creates one CSV file per detected table, so this
# small script reads each CSV in order to combine them into one dataframe.
# `names` assigns a name to every column; `thousands` lets pandas read
# numbers whose thousands separator is a dot.
df_from_each_file = (
    pd.read_csv(
        path + f,
        names=['partenza', 'arrivo', 'paese', 'passeggeri'],
        thousands=r'.',
    )
    for f in os.listdir(path)
)
def test_lattice_contour_plot():
    """Return the ``contour`` plot of the first table read from foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    return camelot.plot(first_table, kind="contour")
def test_joint_plot():
    """Return the ``joint`` plot of the first table read from foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    return camelot.plot(first_table, kind="joint")
r'C:\Users\udaym\Desktop\CP-ML\Petchem\250616004\250616004B.pdf')
# NOTE(review): the line above closes a call that starts before this chunk;
# presumably it assigns tables_1_b - confirm against the preceding lines.
tables_1_b[0]  # table shape (only displayed in an interactive session)
tables_1_b[0].parsing_report
doc1_tabdfB = tables_1_b[0].df

# Same document set, third file: stream flavor with a row tolerance of 10.
tables_1_c = camelot.read_pdf(r'C:\Users\udaym\Desktop\CP-ML\phototest.pdf', flavor='stream', row_tol=10)
tables_1_c[0]  # table shape (only displayed in an interactive session)
tables_1_c[0].parsing_report
camelot.plot(tables_1_c[0], kind='grid')
doc1_tabdfC = tables_1_c[0].df

# 250598825
tables_2_a = camelot.read_pdf(
    r'C:\Users\Prudhvi\Desktop\petchem\invoice_data\page_docC250_1ser.pdf')
tables_2_a[0]  # table shape (only displayed in an interactive session)
tables_2_a[1]
tables_2_a[1].parsing_report
# The grid plot shows table 0, while the DataFrame below is from table 1.
camelot.plot(tables_2_a[0], kind='grid')
doc2_tabdfA = tables_2_a[1].df
# doc2_tabdfA_new = pd.Series('doc2_tabdfA')
doc2_tabdfA_new = []
def OnlinePdfToTxt(dataIo, new_path):
    """Append the text of every horizontal text box of a PDF to a file.

    Args:
        dataIo: Binary file-like object holding the PDF; given to
            ``PDFParser``.
        new_path: Path of the output text file. It is opened in append mode
            with UTF-8 encoding once per text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # Create a document parser.
    parser = PDFParser(dataIo)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Check whether the file allows text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager object that stores the resources.
    resmag = PDFResourceManager()
    # Set the layout analysis parameters.
    laparams = LAParams()
    # Create a PDF device object.
    # device = PDFDevice(resmag)
    device = PDFPageAggregator(resmag, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(resmag, device)
    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object of this page.
        layout = device.get_result()
        for y in layout:
            if not isinstance(y, LTTextBoxHorizontal):
                continue
            try:
                with open(new_path, 'a', encoding="utf-8") as f:
                    f.write(y.get_text() + '\n')
                    print("读入成功!")
            except (OSError, UnicodeError):
                # Best effort: report the failed box and keep going. The
                # original used a bare except, which also swallowed
                # KeyboardInterrupt and programming errors.
                print("读入失败!")


# Location of the file.
url = "file:///I:/Python3.6/patest/PdfTest/pdftestto.pdf"
html = urllib.request.urlopen(urllib.request.Request(url)).read()
dataIo = BytesIO(html)
OnlinePdfToTxt(dataIo, 'd.txt')

import pdfplumber
import re
import json

# Raw strings keep the same path values without invalid escape sequences.
path = r'I:\Python3.6\patest\PdfTest\numberTest 1.pdf'  # path of the PDF file to read

# The context manager closes the PDF even if extraction fails part-way.
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        # print(page.extract_text())
        for pdf_table in page.extract_tables():
            table = []
            cells = []
            for row in pdf_table:
                if not any(row):
                    # A completely empty row ends the current record.
                    if any(cells):
                        table.append(cells)
                        cells = []
                elif all(row):
                    # A completely filled row is a new record of its own, so
                    # the pending record ends here.
                    if any(cells):
                        table.append(cells)
                        cells = []
                    table.append(row)
                else:
                    # A partially filled row continues the pending record:
                    # merge its cells column by column.
                    if len(cells) == 0:
                        cells = row
                    else:
                        for i in range(len(row)):
                            if row[i] is not None:
                                cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
            for row in table:
                data = [re.sub(r'\s+', '', cell) if cell is not None else None for cell in row]
                data_list = list(enumerate(data))
                # print(json.dumps(data_list, indent=2, ensure_ascii=False))
                # Location of the json output file.
                with open(r'I:\Python3.6\patest\PdfTest\numberTest1.json', 'a', encoding="utf-8") as file:
                    file.write(json.dumps(data_list, ensure_ascii=False))

import camelot

# Extract table data from a local PDF file; `pages` selects the PDF pages
# and defaults to the first page.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\special.pdf', pages='1', flavor='stream')

# Table information.
print(tables)
print(tables[0])
# Table data.
print(tables[0].data)

# Extract table data from a local PDF file; `pages` selects the PDF pages
# and defaults to the first page.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\special.pdf', pages='1', flavor='stream')

tables[0].to_csv('special1.csv')

import camelot

# Extract the tables from the PDF file.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\numberTest 1.pdf', pages='1', flavor='stream', strip_text=' .\n')

# Plot the coordinates of the PDF document to locate the table.
plt = camelot.plot(tables[0], kind='text')
plt.show()

# DataFrame of the first table.
table_df = tables[0].df

print(table_df.head(n=80))
import camelot
import tabula
import PyPDF2
import re

# using camelot
tables = camelot.read_pdf(
    'Appraisal Report -270 Industrial Boulevard Kearneysville WV.pdf',
    pages='70',
    # camelot's keyword is spelled `flavor`; this was written as `flavour`,
    # which is not the documented parameter name.
    flavor='lattice',
)
camelot.plot(tables[0], kind='contour')
df1 = tables[0].df
tables[0].to_csv('087-19 04 08 19(C).csv')

# =============================================================================
# #using tabula
# df = tabula.read_pdf("VA_Salem_1725 W Main Street.pdf", pages='66')
# df[0].to_csv('VA_Salem_1725 W Main Street(T).csv')
#
# =============================================================================
# # tables[0].parsing_report
# # df1=tables[0].df
# # camelot.plot(tables[0], kind='contour')
# #
# # =============================================================================
# =============================================================================

# Open the pdf file
# NOTE: `object` shadows the builtin of the same name; the name is kept
# because code outside this section may refer to it.
object = PyPDF2.PdfFileReader("087-19 04 08 19.pdf")
# Get number of pages
NumPages = object.getNumPages()
# information about which building contains self-quarantine citizens

# Import libraries
import camelot
import pandas as pd
from pandas import DataFrame
from datetime import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle

# Read Data
tables = camelot.read_pdf('data/pdf/self_quarantine.pdf', pages='1')
camelot.plot(tables[0], kind='joint')
plt.show()

# Stack the DataFrames of all detected tables into one frame.
sqTable = []
for table in tables:
    sqTable.append(table.df)
sqTable = pd.concat(sqTable)
sqTable.columns = ['CaseNo', 'District', 'Address', 'EndDate']

# Drop every run of non-ASCII characters from the two text columns.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `sqTable.District`: that chained in-place
# form acts on an intermediate Series and does not update the frame when
# pandas copy-on-write is active.
non_ascii = {r'[^\x00-\x7F]+': ''}
sqTable['District'] = sqTable['District'].replace(non_ascii, regex=True)
sqTable['Address'] = sqTable['Address'].replace(non_ascii, regex=True)

# for address in sqTable['Address']:
#     print(address)
def test_stream_contour_plot():
    """Return the ``contour`` plot of the first stream table in 12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='stream')[0]
    return camelot.plot(first_table, kind='contour')
def advanced_usage():
    """Read a sample PDF with camelot and plot its first table.

    Only the plain ``camelot.read_pdf(pdf_filepath)`` call is active; the
    commented-out calls are alternative invocations kept for reference.
    On ``IOError`` a message is printed and the function returns. When at
    least one table is found, the ``text``, ``grid`` and ``contour`` plots
    are produced, followed by ``line`` and ``joint`` for a lattice table or
    ``textedge`` for a stream table; otherwise "No table found." is printed.
    """
    pdf_filepath = "/path/to/sample.pdf"

    try:
        tables = camelot.read_pdf(pdf_filepath)
        #tables = camelot.read_pdf(pdf_filepath, process_background=True)

        # Specify table areas.
        # It is useful to specify exact table boundaries.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", table_areas=["316,499,566,337"])  # (left, top, right, bottom). PDF coordinate system.

        # Specify table regions.
        # Tables might not lie at the exact coordinates every time but in an approximate region.
        #tables = camelot.read_pdf(pdf_filepath, table_regions=["170,370,560,270"])  # (left, top, right, bottom). PDF coordinate system.

        # Specify column separators.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"])
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True)

        # Flag superscripts and subscripts.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", flag_size=True)

        # Strip characters from text.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", strip_text=" .\n")

        # Improve guessed table areas.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", edge_tol=500)

        # Improve guessed table rows.
        #tables = camelot.read_pdf(pdf_filepath, flavor="stream", row_tol=10)

        # Detect short lines.
        #tables = camelot.read_pdf(pdf_filepath, line_scale=40)

        # Shift text in spanning cells.
        #tables = camelot.read_pdf(pdf_filepath, line_scale=40, shift_text=[""])
        #tables = camelot.read_pdf(pdf_filepath, line_scale=40, shift_text=["r", "b"])

        # Copy text in spanning cells.
        #tables = camelot.read_pdf(pdf_filepath, copy_text=["v"])

        # Tweak layout generation.
        # Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences.
        # To deal with such cases, you can tweak PDFMiner's LAParams kwargs to improve layout generation, by passing the keyword arguments as a dict using layout_kwargs in read_pdf().
        #tables = camelot.read_pdf(pdf_filepath, layout_kwargs={"detect_vertical": False})

        # Use alternate image conversion backends.
        # When using the Lattice flavor, Camelot uses ghostscript to convert PDF pages to images for line recognition.
        #tables = camelot.read_pdf(pdf_filepath, backend="ghostscript")  # {"ghostscript", "poppler"}.
    except IOError as ex:
        print("File not found, {}: {}.".format(pdf_filepath, ex))
        return

    # REF [site] >> https://camelot-py.readthedocs.io/en/master/api.html
    #camelot.handlers.PDFHandler class.
    #camelot.parsers.Stream class.
    #camelot.parsers.Lattice class.

    # Visualize.
    if len(tables) > 0:
        table = tables[0]

        # Each plt.title call comes after the corresponding figure's show()
        # and relies on pyplot's current figure at that point.
        camelot.plot(table, kind="text").show()
        plt.title("Table Text")
        camelot.plot(table, kind="grid").show()
        plt.title("Table Grid")
        camelot.plot(table, kind="contour").show()
        plt.title("Table Contour")
        if table.flavor == "lattice":
            camelot.plot(table, kind="line").show()
            plt.title("Table Line")
            camelot.plot(table, kind="joint").show()
            plt.title("Table Joint")
        if table.flavor == "stream":
            camelot.plot(table, kind="textedge").show()
            plt.title("Table TextEdge")
        plt.show()
    else:
        print("No table found.")
def test_joint_plot_ghostscript():
    """Return the ``joint`` plot of foo.pdf read with the ghostscript backend."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, backend="ghostscript")[0]
    return camelot.plot(first_table, kind="joint")
def test_grid_plot_poppler():
    """Return the ``grid`` plot of foo.pdf read with the poppler backend."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, backend="poppler")[0]
    return camelot.plot(first_table, kind="grid")
line_tol=10) #tables = camelot.read_pdf(pdf, flavor = 'stream') flag = False while (flag == False): print(tables[0].parsing_report) print( "If the tables accuracy is less than 90 or the whitespace is greater than 30, you might want to change some settings to get a better table read" ) settingToChange = input( "please type one of the following settings to update and get a better read: \n TABLE_AREA, LINE_SCALE, JOINT_TOL, LINE_TOL \n Type \"READY\" to perform another table read. Type \"DONE\" to output finished Excel file\n" ) if settingToChange == "TABLE_AREA": print( "please write down the x and y coordinate of the top left corner and the bottom right corner of the table. \n The coordinates can be found on the provided graph when hovering over a point. \n Exit out of the graph when you have the points ready" ) plt = camelot.plot(tables[0], kind='text').show() tk.mainloop() table_area_string = input( "This data should be entered in the form of \"x1,y1,x2,y2\" \n") if settingToChange == "LINE_SCALE": nline_scale = input( "please enter the line scale you'd like to use. (default is 15) \n" ) if settingToChange == "JOINT_TOL": njoint_tol = input( "please enter the joint tol you'd like to use. (default is 2) \n") if settingToChange == "LINE_TOL": nline_tol = input( "please enter the line tol you'd like to use. (default is 2) \n") if settingToChange == "READY": if table_area_string and nline_scale and njoint_tol and nline_tol: