def parsePdfMenu(path, debug=False):
    """Parse the menu table of a PDF into a ``menu.Week``.

    The first row of the extracted table is handed to ``parsePriceInfo``;
    every following row must contain exactly six cells and becomes one day
    that is appended to the week.

    Args:
        path: Location of the PDF, passed straight through to ``getTable``.
        debug: When true, print the table and its accuracy and show
            camelot's contour and grid plots before parsing.

    Returns:
        The populated ``menu.Week``, or ``None`` when ``getTable`` returns
        ``None`` or a row does not have six cells.
    """
    table = getTable(path)
    if table is None:
        return None

    if debug:
        print(table)
        print(table.accuracy)
        camelot.plot(table, kind='contour')
        camelot.plot(table, kind='grid')
        plt.show()

    week = menu.Week()

    price_info = parsePriceInfo(table.data[0:1])

    for row in table.data[1:]:
        if len(row) != 6:
            # The length must be formatted into the message; concatenating
            # the int with a str raised TypeError instead of reporting.
            print(f"{len(row)} rows instead of six, aborting")
            return None
        cleanRow = cleanParsedRow(row)
        day = parseMenuForDay(cleanRow)
        day.addPrices(price_info)
        week.addNextDay(day)

    return week
Esempio n. 2
0
def get_pdf_information():
    """Extract a table from page 3 of ``UFF.pdf`` and save part of it as CSV.

    Reads the page with camelot's stream flavor, draws the text plot of the
    first table, prints the type of its DataFrame and writes rows 7..70 of
    that DataFrame to ``LJ_data.csv``.
    """
    # NOTE(review): recent camelot releases spell this keyword `table_areas`;
    # confirm against the installed version before renaming.
    found = camelot.read_pdf(
        'UFF.pdf',
        pages='3',
        flavor='stream',
        table_area=['68,710, 545,140'],
    )
    first_table = found[0]
    camelot.plot(first_table, kind='text')
    # Call plt.show() here when the plot should be inspected interactively.
    frame = first_table.df
    print(type(frame))
    frame[7:71].to_csv('LJ_data.csv')
Esempio n. 3
0
def exportPDFTableDataToCSV():
    """Read ``foo.pdf`` from the working directory and plot each table found.

    Prints the table list, then for every table prints its index and draws
    camelot's contour plot. Nothing is written to disk.
    """
    pdf_file = os.path.join(os.getcwd(), 'foo.pdf')
    tables = camelot.read_pdf(pdf_file)
    print(tables)

    # To export every table of the PDF to CSV instead:
    #     tables.export('foo.csv', f='csv', compress=True)

    for number, table in enumerate(tables):
        print("On Table Num: {}".format(number))
        camelot.plot(table, kind='contour')
Esempio n. 4
0
 def __init__(self, parent=None, width=8, height=8, dpi=100):
     """Build the canvas around a camelot text-edge plot of a fixed PDF page.

     A blank Figure of ``width`` x ``height`` is created first and is
     replaced by camelot's plot when the PDF yields at least one table.

     NOTE(review): the ``dpi`` argument is never read; the figure is created
     at 200 dpi and then set to 150.
     """
     fig = Figure(figsize=(width, height),
                  dpi=200)  # Create a Figure; note this is matplotlib's Figure, not the figure from matplotlib.pyplot
     pdf = camelot.read_pdf(r"C:\Users\localhost\Desktop\石家庄市2018年市本级和全市财政总决算报表.pdf", flavor='stream', pages='5')
     # A table list with no tables is falsy, so the blank Figure is kept.
     if pdf:
         fig = camelot.plot(pdf[0], kind='textedge')
     fig.set_dpi(150)
     axis('tight')
     FigureCanvas.__init__(self, fig)  # Initialise the parent class
     self.setParent(parent)
def camelot_pdf_parsing():
    """Parse two sample PDFs with camelot, dumping and plotting every table.

    For each configured PDF the tables are read with that PDF's keyword
    arguments, each table is written next to the PDF as ``<name>_<i>.csv``,
    its DataFrame is printed and its text and grid plots are shown.
    """
    pdf_configs = [
        {"path": "data/background_lines.pdf",
         "kwargs": {"process_background": True}},
        {"path": "data/PhDThesis.pdf",
         "kwargs": {}},
    ]

    for config in pdf_configs:
        pdf_path = config["path"]
        print("\n#=== Parsing {} =# \n".format(pdf_path))
        # Pass pages="1" as well to restrict parsing to the first page.
        tables = camelot.read_pdf(pdf_path, **config["kwargs"])
        print(tables)

        for table_no, table in enumerate(tables):
            csv_path = pdf_path.replace(".pdf", f"_{table_no}.csv")
            table.to_csv(csv_path)
            print(table.df)

            for kind in ('text', 'grid'):
                camelot.plot(table, kind=kind)
            plt.show()
def extract_table(table):
    """Extract one table region from its PDF and save it as CSV and HTML.

    The region and page come from ``table``. Camelot is first called with
    its default flavor and ``line_scale=40``; when that finds nothing the
    stream flavor is tried. The first table found is written to
    ``<uuid>.csv``, re-read with pandas, written to ``<uuid>.html`` and
    shown as a contour plot. Any exception is printed, not raised.

    Args:
        table: Record exposing ``fileId``, ``uuid``, ``page`` and the
            ``pdfX1``/``pdfY1``/``pdfX2``/``pdfY2`` coordinates.
    """
    try:
        pdf_path = str(pdf_files_folder.joinpath(f"{table.fileId}.pdf"))
        areas = [f"{table.pdfX1},{table.pdfY1},{table.pdfX2},{table.pdfY2}"]

        found = camelot.read_pdf(pdf_path,
                                 table_areas=areas,
                                 pages=str(table.page),
                                 strip_text='\n',
                                 line_scale=40,
                                 flag_size=True)
        print(f"found {len(found)} tables with lattice")

        if len(found) == 0:
            # Nothing detected from ruling lines; retry with the stream flavor.
            found = camelot.read_pdf(pdf_path,
                                     table_areas=areas,
                                     pages=str(table.page),
                                     strip_text='\n',
                                     flavor="stream",
                                     flag_size=True)
            print(f"found {len(found)} tables with stream")

        if len(found) == 0:
            print(f">>>> No tables found for table ID {table.uuid}")
            return

        best = found[0]
        csv_path = csv_tables_folder_path.joinpath(f"{table.uuid}.csv")
        best.to_csv(csv_path, index=False, header=False)

        # Round-trip through pandas so the HTML is rendered from the CSV.
        frame = pd.read_csv(csv_path,
                            na_filter=False,
                            skip_blank_lines=False,
                            header=None)
        html_path = html_tables_folder_path.joinpath(f"{table.uuid}.html")
        frame.to_html(html_path,
                      index=False,
                      header=False,
                      encoding="utf-8-sig",
                      na_rep=" ")

        figure = camelot.plot(best, kind='contour')
        figure.suptitle(table.uuid)
        plt.show()
    except Exception as err:
        print(f"==== Error extracting table ID {table.uuid}  ======")
        print(err)
        print("======================================")
Esempio n. 7
0
 def on_check_btn_clicked(self):
     """Show camelot's text-edge plot for the first table on the chosen page.

     The page is read from ``currentPageEdit`` and the file path from
     ``pathEdit``. An information box is shown, and nothing else happens,
     when either field is blank or the path does not end in ``.pdf``.
     """
     # self.find_code_by_name()
     page_text = self.currentPageEdit.text().strip()
     path_text = self.pathEdit.text().strip()

     if page_text == "" or path_text == "":
         QMessageBox.information(self, "提示", '    输入不能为空!    ')
         return
     if not path_text.endswith(".pdf"):
         QMessageBox.information(self, "提示", '    只有PDF文件需要次操作!    ')
         return

     # Normalise the path once instead of repeating the chain per use.
     pdf_path = path_text.replace('.docx', '.pdf').replace(".doc", '.pdf')
     print(pdf_path)
     pdf = camelot.read_pdf(pdf_path, flavor='stream', pages=page_text)
     if pdf:
         # camelot.plot returns a Figure; keep it under its own name so it
         # does not shadow the pyplot alias `plt`.
         fig = camelot.plot(pdf[0], kind='textedge')
         axis('tight')
         # FigureCanvasBase.set_window_title was removed in Matplotlib 3.6;
         # the figure manager owns the window title.
         fig.canvas.manager.set_window_title(
             "第" + page_text + "页表格解析示意图")
         # Show last so the axis limits and title are already in place.
         fig.show()
Esempio n. 8
0
def test_textedge_plot():
    """Return the text-edge plot of the first stream table in 12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="stream")[0]
    return camelot.plot(first_table, kind="textedge")
Esempio n. 9
0
import os

path = os.getcwd()

# Read the PDF, selecting the pages of interest and giving the area of the
# sheet through table_areas as ['x_leftup, yleftup, xrightdown, yrightdown'].
# The original note says the coordinates are in pixels and that the plot
# shown below helps to find them. The author was unsure what `flavor` does
# but observed that the CSVs come out better with it.
df = camelot.read_pdf(
    f"{path}/ENAC_Traffic_data_2017_en.pdf",
    pages='45,46,47',
    flavor='stream',
    table_areas=['30,690,552,16'],
)

# Display the text areas that the package manages to detect.
camelot.plot(df[0], kind='text')
show()

df.export(f"{path}/Risultati/boh.csv", f='csv')

path = "D:/OneDrive/OneDrive - Universita' degli Studi di Roma Tor Vergata/Python/Scraping/Risultati/"

# Camelot writes one CSV per detected table, so each file in the results
# folder is read into its own DataFrame (lazily, one per iteration).
# `names` assigns a name to every column; `thousands` lets pandas read
# numbers whose thousands separator is a dot.
df_from_each_file = (
    pd.read_csv(
        path + csv_name,
        names=['partenza', 'arrivo', 'paese', 'passeggeri'],
        thousands=r'.',
    )
    for csv_name in os.listdir(path)
)
Esempio n. 10
0
def test_lattice_contour_plot():
    """Return the contour plot of the first table read from foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    return camelot.plot(first_table, kind="contour")
Esempio n. 11
0
def test_joint_plot():
    """Return the joint plot of the first table read from foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path)[0]
    return camelot.plot(first_table, kind="joint")
Esempio n. 12
0
    r'C:\Users\udaym\Desktop\CP-ML\Petchem\250616004\250616004B.pdf')

# Notebook-style inspection: the bare expressions in this section only
# display a value in an interactive session and do nothing in a script.
tables_1_b[0]  #Table Shape

tables_1_b[0].parsing_report

# DataFrame view of the first table in tables_1_b.
doc1_tabdfB = tables_1_b[0].df

# Re-read phototest.pdf with the stream flavor and row_tol=10.
tables_1_c = camelot.read_pdf(r'C:\Users\udaym\Desktop\CP-ML\phototest.pdf',
                              flavor='stream',
                              row_tol=10)

tables_1_c[0]  #Table Shape

tables_1_c[0].parsing_report
# Draw the detected grid of the first table.
camelot.plot(tables_1_c[0], kind='grid')

# DataFrame view of the first table in tables_1_c.
doc1_tabdfC = tables_1_c[0].df

#250598825

# Read the next document with camelot's default settings.
tables_2_a = camelot.read_pdf(
    r'C:\Users\Prudhvi\Desktop\petchem\invoice_data\page_docC250_1ser.pdf')

tables_2_a[0]  #Table Shape
tables_2_a[1]
tables_2_a[1].parsing_report
# NOTE(review): the plot uses table 0 while the DataFrame below comes from
# table 1 - confirm this is intended.
camelot.plot(tables_2_a[0], kind='grid')
doc2_tabdfA = tables_2_a[1].df
#doc2_tabdfA_new = pd.Series('doc2_tabdfA')
doc2_tabdfA_new = []
Esempio n. 13
0
def OnlinePdfToTxt(dataIo, new_path):
    """Append the text of every horizontal text box of a PDF to a text file.

    Args:
        dataIo: Binary file-like object holding the PDF; given to PDFParser.
        new_path: Path of the output file, opened in append mode as UTF-8
            once per text box.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # Create a document parser.
    parser = PDFParser(dataIo)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Refuse documents that do not allow text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager that stores shared resources.
    resmag = PDFResourceManager()
    # Layout-analysis parameters.
    laparams = LAParams()
    # Aggregating device that collects the layout of each page.
    device = PDFPageAggregator(resmag, laparams=laparams)
    # Interpreter that feeds page content to the device.
    interpreter = PDFPageInterpreter(resmag, device)

    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Fetch the LTPage object for this page.
        layout = device.get_result()
        for y in layout:
            # Best effort per item, but let KeyboardInterrupt/SystemExit
            # propagate (the bare except swallowed them).
            try:
                if isinstance(y, LTTextBoxHorizontal):
                    with open('%s' % (new_path), 'a', encoding="utf-8") as f:
                        f.write(y.get_text() + '\n')
                        print("读入成功!")
            except Exception:
                print("读入失败!")
51
# Location of the PDF to convert.
url = "file:///I:/Python3.6/patest/PdfTest/pdftestto.pdf"
# Close the response as soon as its bytes have been read.
with urllib.request.urlopen(urllib.request.Request(url)) as response:
    html = response.read()
dataIo = BytesIO(html)
OnlinePdfToTxt(dataIo, 'd.txt')





import re
import json


def merge_table_rows(pdf_table):
    """Merge the partial rows of one extracted table into complete records.

    A row whose cells are all empty ends the record being accumulated. A
    row whose cells are all filled is a complete record of its own (and
    also ends any accumulated one). Any other row is a fragment: its
    non-``None`` cells are concatenated, column by column, onto the record
    being accumulated.

    Args:
        pdf_table: List of rows, each a list of ``str`` or ``None`` cells.

    Returns:
        A new list of records; ``pdf_table`` and its rows are not modified.
    """
    table = []
    cells = []
    for row in pdf_table:
        if not any(row):
            # An entirely empty row marks the end of a record.
            if any(cells):
                table.append(cells)
                cells = []
        elif all(row):
            # A fully populated row is a new record; flush the previous one.
            if any(cells):
                table.append(cells)
                cells = []
            table.append(row)
        elif len(cells) == 0:
            # Copy so that merging later fragments does not mutate the input.
            cells = list(row)
        else:
            for i, value in enumerate(row):
                if value is not None:
                    cells[i] = value if cells[i] is None else cells[i] + value
    # Flush a record still being accumulated when the table ends; it was
    # silently dropped before.
    if any(cells):
        table.append(cells)
    return table


def clean_row(row):
    """Return ``row`` with all whitespace removed from every non-None cell."""
    return [re.sub(r'\s+', '', cell) if cell is not None else None
            for cell in row]


def main():
    """Extract every table of the PDF and append its records to a JSON file."""
    # Imported here so the helpers above work without pdfplumber installed.
    import pdfplumber

    # Raw strings keep the same characters without invalid escape sequences.
    path = r'I:\Python3.6\patest\PdfTest\numberTest 1.pdf'  # PDF to read
    json_path = r'I:\Python3.6\patest\PdfTest\numberTest1.json'  # output file

    # The context manager closes the PDF even when extraction fails.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # print(page.extract_text())
            for pdf_table in page.extract_tables():
                for row in merge_table_rows(pdf_table):
                    data_list = list(enumerate(clean_row(row)))
                    # print(json.dumps(data_list, indent=2, ensure_ascii=False))
                    # NOTE(review): records are appended with no separator,
                    # so the file is not a single valid JSON document.
                    with open(json_path, 'a', encoding="utf-8") as file:
                        file.write(json.dumps(data_list, ensure_ascii=False))


if __name__ == "__main__":
    main()



import camelot

# Extract table data from a local PDF; `pages` selects the pages to parse
# (the first page by default). The raw string keeps the same path without
# invalid escape sequences.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\special.pdf', pages='1', flavor='stream')

# Table information.
print(tables)
print(tables[0])
# Table data.
print(tables[0].data)


# Extract table data from a local PDF; `pages` selects the pages to parse
# (the first page by default). The raw string keeps the same path without
# invalid escape sequences.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\special.pdf', pages='1', flavor='stream')

tables[0].to_csv('special1.csv')


import camelot


# Extract tables from the PDF. The raw string keeps the same path without
# invalid escape sequences.
tables = camelot.read_pdf(r'I:\Python3.6\patest\PdfTest\numberTest 1.pdf', pages='1', flavor='stream', strip_text=' .\n')

# Plot the text coordinates of the PDF page to locate the table.
# camelot.plot returns a Figure; keep it under its own name so the pyplot
# alias `plt` is not overwritten.
fig = camelot.plot(tables[0], kind='text')
fig.show()

# DataFrame view of the first table.
table_df = tables[0].df

print(table_df.head(n=80))
import camelot
import tabula
import PyPDF2
import re
# Using camelot: read page 70 with the lattice flavor, plot the contour of
# the first table and save that table as CSV.
# The keyword is spelled `flavor`; the misspelled `flavour` did not select
# a flavor.
tables = camelot.read_pdf(
    'Appraisal Report -270 Industrial Boulevard Kearneysville WV.pdf',
    pages='70',
    flavor='lattice')
camelot.plot(tables[0], kind='contour')
df1 = tables[0].df
tables[0].to_csv('087-19 04 08 19(C).csv')

# =============================================================================
# #using tabula
# df = tabula.read_pdf("VA_Salem_1725 W Main Street.pdf", pages='66')
# df[0].to_csv('VA_Salem_1725 W Main Street(T).csv')
# # =============================================================================
# # tables[0].parsing_report
# # df1=tables[0].df
# # camelot.plot(tables[0], kind='contour')
# #
# # =============================================================================
#
# =============================================================================

# Open the pdf file
# NOTE(review): the name `object` shadows the Python builtin; it is left
# as-is because later code outside this view may refer to it.
# NOTE(review): PyPDF2 3.x replaced PdfFileReader/getNumPages with
# PdfReader/len(reader.pages) - confirm against the installed version.
object = PyPDF2.PdfFileReader("087-19 04 08 19.pdf")

# Get number of pages
NumPages = object.getNumPages()
# information about which building contains self-quarantine citizens

# Import libraries
import camelot
import pandas as pd
from pandas import DataFrame
from datetime import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle

# Read the tables on page 1 of the self-quarantine PDF.
tables = camelot.read_pdf('data/pdf/self_quarantine.pdf', pages='1')

# Show the detected line joints of the first table.
camelot.plot(tables[0], kind='joint')
plt.show()

# Stack the DataFrames of all tables into a single frame.
sqTable = pd.concat([table.df for table in tables])

sqTable.columns = ['CaseNo', 'District', 'Address', 'EndDate']

# Strip every run of non-ASCII characters from the text columns. Assign the
# result back to the column: an in-place replace on `sqTable.District` is a
# chained update that does not reach the parent frame under pandas
# Copy-on-Write.
for column in ('District', 'Address'):
    sqTable[column] = sqTable[column].replace({r'[^\x00-\x7F]+': ''},
                                              regex=True)

# for address in sqTable['Address']:
# 	print(address)
Esempio n. 16
0
def test_stream_contour_plot():
    """Return the contour plot of the first stream table in 12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='stream')[0]
    return camelot.plot(first_table, kind='contour')
Esempio n. 17
0
def advanced_usage():
    """Read a sample PDF with camelot and show each plot kind for table 0.

    The commented-out calls inside the ``try`` list alternative
    ``read_pdf`` options. Prints a message and returns when reading raises
    ``IOError`` or when no table is found.
    """
    pdf_filepath = "/path/to/sample.pdf"

    try:
        tables = camelot.read_pdf(pdf_filepath)
        # Alternative calls, one per tuning option:
        #
        # Process background lines.
        #   camelot.read_pdf(pdf_filepath, process_background=True)
        # Exact table boundaries, (left, top, right, bottom) in the PDF
        # coordinate system.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", table_areas=["316,499,566,337"])
        # Approximate table regions, for tables that move between pages.
        #   camelot.read_pdf(pdf_filepath, table_regions=["170,370,560,270"])
        # Column separators.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"])
        #   camelot.read_pdf(pdf_filepath, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True)
        # Flag superscripts and subscripts.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", flag_size=True)
        # Strip characters from text.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", strip_text=" .\n")
        # Improve guessed table areas.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", edge_tol=500)
        # Improve guessed table rows.
        #   camelot.read_pdf(pdf_filepath, flavor="stream", row_tol=10)
        # Detect short lines.
        #   camelot.read_pdf(pdf_filepath, line_scale=40)
        # Shift text in spanning cells.
        #   camelot.read_pdf(pdf_filepath, line_scale=40, shift_text=[""])
        #   camelot.read_pdf(pdf_filepath, line_scale=40, shift_text=["r", "b"])
        # Copy text in spanning cells.
        #   camelot.read_pdf(pdf_filepath, copy_text=["v"])
        # Tweak layout generation: camelot builds on PDFMiner's grouping of
        # characters into words and sentences, and PDFMiner's LAParams can
        # be passed as a dict through layout_kwargs.
        #   camelot.read_pdf(pdf_filepath, layout_kwargs={"detect_vertical": False})
        # Alternate image conversion backend for the lattice flavor, which
        # converts pages to images for line recognition
        # ("ghostscript" or "poppler").
        #   camelot.read_pdf(pdf_filepath, backend="ghostscript")
    except IOError as ex:
        print("File not found, {}: {}.".format(pdf_filepath, ex))
        return

    # REF [site] >> https://camelot-py.readthedocs.io/en/master/api.html
    #   camelot.handlers.PDFHandler class.
    #   camelot.parsers.Stream class.
    #   camelot.parsers.Lattice class.

    if len(tables) == 0:
        print("No table found.")
        return

    # Visualize the first table with every plot kind valid for its flavor.
    table = tables[0]
    plots = [
        ("text", "Table Text"),
        ("grid", "Table Grid"),
        ("contour", "Table Contour"),
    ]
    if table.flavor == "lattice":
        plots += [("line", "Table Line"), ("joint", "Table Joint")]
    if table.flavor == "stream":
        plots.append(("textedge", "Table TextEdge"))

    for kind, title in plots:
        camelot.plot(table, kind=kind).show()
        plt.title(title)

    plt.show()
Esempio n. 18
0
def test_joint_plot_ghostscript():
    """Return the joint plot of foo.pdf's first table, ghostscript backend."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, backend="ghostscript")[0]
    return camelot.plot(first_table, kind="joint")
Esempio n. 19
0
def test_grid_plot_poppler():
    """Return the grid plot of foo.pdf's first table, poppler backend."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, backend="poppler")[0]
    return camelot.plot(first_table, kind="grid")
Esempio n. 20
0
                          line_tol=10)
#tables = camelot.read_pdf(pdf, flavor = 'stream')
flag = False
while (flag == False):
    print(tables[0].parsing_report)
    print(
        "If the tables accuracy is less than 90 or the whitespace is greater than 30, you might want to change some settings to get a better table read"
    )
    settingToChange = input(
        "please type one of the following settings to update and get a better read: \n TABLE_AREA, LINE_SCALE, JOINT_TOL, LINE_TOL \n Type \"READY\" to perform another table read. Type \"DONE\" to output finished Excel file\n"
    )
    if settingToChange == "TABLE_AREA":
        print(
            "please write down the x and y coordinate of the top left corner and the bottom right corner of the table. \n The coordinates can be found on the provided graph when hovering over a point. \n Exit out of the graph when you have the points ready"
        )
        plt = camelot.plot(tables[0], kind='text').show()
        tk.mainloop()
        table_area_string = input(
            "This data should be entered in the form of \"x1,y1,x2,y2\" \n")
    if settingToChange == "LINE_SCALE":
        nline_scale = input(
            "please enter the line scale you'd like to use. (default is 15) \n"
        )
    if settingToChange == "JOINT_TOL":
        njoint_tol = input(
            "please enter the joint tol you'd like to use. (default is 2) \n")
    if settingToChange == "LINE_TOL":
        nline_tol = input(
            "please enter the line tol you'd like to use. (default is 2) \n")
    if settingToChange == "READY":
        if table_area_string and nline_scale and njoint_tol and nline_tol: