Python read_pdfの例、camelot.io.read_pdf Pythonの例

コード例 #1

0

ファイルを表示

def generateCSV(filename):
    """
    Read a PDF file and create a single CSV equivalent.

    The tables extracted by camelot are first exported as intermediate CSV
    files next to the PDF, then merged into
    ``api_covid19/files/{filename}.csv`` and finally deleted. A "." is
    printed after each stage as a progress indicator.

    Arguments:
        filename -- PDF to read without .pdf extension

    Raises:
        IndexError -- if the export produced no intermediate CSV file
    """
    import re

    def _natural_key(path):
        # Compare digit runs numerically so that "...-page-10-..." sorts
        # after "...-page-2-..." instead of right after "...-page-1-...".
        return [int(tok) if tok.isdigit() else tok
                for tok in re.split(r'(\d+)', path)]

    print(".", end='')
    # Get path and number of pages
    n_pages = getPagesNumber(filename)
    file_path = f'api_covid19/files/{filename}.pdf'

    # Convert PDF to CSV
    print(".", end='')
    tables = camelot.read_pdf(file_path, pages=f'1-{n_pages}', split_text=True)
    print(".", end='')
    tables.export(f'api_covid19/files/intermediate_{filename}.csv', f='csv', compress=False)
    print(".", end='')

    # Merge generated CSV files into just one, in page/table order
    all_filenames = sorted(
        glob.glob(f'api_covid19/files/intermediate_{filename}*.csv'),
        key=_natural_key,
    )
    # The first file provides the header row for the combined output
    frames = [pd.read_csv(all_filenames[0])]
    print(".", end='')

    for path in all_filenames[1:]:
        # Later files are read without a header and take the first file's
        # column names so that all rows line up.
        df = pd.read_csv(path, header=None)
        df.columns = frames[0].columns
        frames.append(df)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    combined_csv = pd.concat(frames)

    print(".", end='')
    combined_csv.to_csv(f'api_covid19/files/{filename}.csv', index=False, encoding='utf-8-sig')

    # Finally remove intermediate CSV files
    for path in all_filenames:
        os.remove(path)

コード例 #2

0

ファイルを表示

    def bankB(self):
        """Convert the bankB statement PDF at ``self.path`` into ``./bankBSolution.csv``.

        The table is extracted with camelot, exported to CSV, cleaned up with
        pandas (gaps in 'Information' are filled from 'Unnamed: 2', then the
        helper and unnamed columns are dropped) and written out. The
        intermediate CSV files are deleted afterwards.
        """
        # Read bankB statement
        tables = camelot.read_pdf(self.path, flavor='stream', columns=['72,95,209,327,442,529'],table_areas=['0,792,800,100'])

        # Export pages of bankB pdf
        tables.export('./bankB.csv', f='csv')
        # Read data from the exported csv file, skipping its first 4 rows
        df=pandas.read_csv('./bankB-page-1-table-1.csv',skiprows=4)

        # Fill missing 'Information' values from the 'Unnamed: 2' column
        df['InformationReplacing'] = df['Information'].fillna(df['Unnamed: 2'])

        # Overwrite 'Information' with the merged values
        df['Information'] = df['InformationReplacing']
        df.to_csv('./tempB1.csv')

        # Read tempB1 file back
        df1=pandas.read_csv('./tempB1.csv')

        # Drop unnecessary columns: every "Unnamed" column plus the helper
        unnamed_columns = [name for name in df1.columns if "Unnamed" in name]
        df1.drop(unnamed_columns,axis=1,inplace=True)
        df1.drop('InformationReplacing',axis=1,inplace=True)

        # Output bankB Solution
        df1.to_csv('./bankBSolution.csv')
        os.remove('./bankB-page-1-table-1.csv')
        # Must match the name written above exactly; './tempb1.csv' does not
        # exist on case-sensitive filesystems.
        os.remove('./tempB1.csv')

コード例 #3

0

ファイルを表示

 def bankA(self):
     """Extract the bankA statement at ``self.path`` and post-process it.

     All pages are parsed with camelot's 'stream' flavor and exported as CSV
     using './bank.csv' as the base path; cleanup and merging are delegated
     to ``removeExtraColumnsBankA`` and ``concatenateBankA``.
     """
     read_options = {'pages': 'all', 'flavor': 'stream'}
     extracted = camelot.read_pdf(self.path, **read_options)

     # Export the extracted tables of the bankA pdf
     extracted.export('./bank.csv', f='csv')

     self.removeExtraColumnsBankA()
     self.concatenateBankA()

コード例 #4

0

ファイルを表示

    def bankC(self):
        """Convert the bankC statement PDF at ``self.path`` into ``./bankCSolution.csv``.

        Extraction is limited to one table area (per the original note, this
        keeps only the top table). The exported per-table CSV is re-read
        without its first row, saved as the solution file and then deleted.
        """
        exported_csv = './bankC-page-1-table-1.csv'

        # Read bankC statement, restricted to the top table area
        extracted = camelot.read_pdf(
            self.path,
            flavor='stream',
            table_areas=['0,792,800,400'],
        )

        # Export pages of bankC
        extracted.export('./bankC.csv', f='csv')

        # Re-read the exported table, skipping its first row
        statement = pandas.read_csv(exported_csv, skiprows=1)

        # Output bankC solution and clean up the intermediate file
        statement.to_csv('./bankCSolution.csv')
        os.remove(exported_csv)

コード例 #5

0

ファイルを表示

ファイル: predict_table.py プロジェクト: mjdhasan/Parsing-PDFs-using-YOLOV3

def detect_tables(opt):
    """Detect tables on one PDF page and save each one as an Excel file.

    The page ``opt.page`` of ``opt.pdf_path`` is rendered to a JPEG, passed
    through the table detector, and every detected box is converted to a
    camelot ``table_areas`` string. camelot then extracts those areas with
    the 'stream' flavor and each table is written to
    ``<pdf path minus last 4 chars>-<page>-table-<i>.xlsx``.

    Arguments:
        opt -- object exposing ``pdf_path`` and ``page`` attributes
    """
    pdf_file = opt.pdf_path
    pg = opt.page
    # Common prefix of every file this function creates
    stem = pdf_file[:-4] + "-" + str(pg)

    see_example = False
    img_path = stem + ".jpg"
    pdf_page = norm_pdf_page(pdf_file, pg)
    img = pdf_page2img(pdf_file, pg, save_image=True)

    # Run the detector on the rendered page image
    detector_opt = parameters(img_path)
    detections = outpout_yolo(detectTable(detector_opt))

    # Remove the temporary image and the detector's output directory
    os.remove(img_path)
    os.rmdir("outputs")

    if see_example:
        # Debug aid: outline every detected box on top of the page image
        for box in detections:
            [[x1_img, y1_img, x2_img, y2_img], [w_table, h_table],
             [H_img, W_img]] = img_dim(img, box)
            xs = [x1_img, x2_img, x2_img, x1_img, x1_img]
            ys = [y1_img, y1_img, y2_img, y2_img, y1_img]
            plt.plot(xs, ys, linestyle='-.', alpha=0.7)
        plt.imshow(img)
        plt.savefig(stem + ".png")

    interesting_areas = []
    for box in detections:
        # x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        # in PDF coordinate space
        x1, y1, x2, y2 = bboxes_pdf(img, pdf_page, box)
        interesting_areas.append(",".join(map(str, (x1, y1, x2, y2))))

    extracted = camelot.read_pdf(filepath=pdf_file,
                                 pages=str(pg),
                                 flavor="stream",
                                 table_areas=interesting_areas)
    frames = [table.df for table in extracted]
    for i, frame in enumerate(frames):
        frame.to_excel(stem + "-table-" + str(i) + ".xlsx")

コード例 #6

0

ファイルを表示

def main():
    """Extract one table area from a hard-coded statement PDF and print it.

    The PDF layout is loaded once through ``faster_load`` and handed to
    ``read_pdf`` via ``preprocess_kwargs``; the resulting table list and
    each table's DataFrame are printed.
    """
    from camelot.io import read_pdf

    filename = "/Users/vijender/indvision-data/acctstmt_d_xxxxxx591m_emailacctstmt_unlckd.pdf"
    layouts, _pq_obj, dimensions = faster_load.load_pdf_and_layout(filename, {})

    # Options for the 'stream' parser, including the preloaded layout data
    stream_options = dict(
        flavor='stream',
        table_areas=["0,448,605,390"],
        pages="1",
        row_tol=5,
        column_tol=0,
        edge_tol=100,
        num_columns=7,
        preprocess_kwargs={'layouts': layouts, 'dimensions': dimensions},
    )
    tables = read_pdf(filepath=filename, **stream_options)

    print(tables)
    for table in tables:
        pprint(table.df)

コード例 #7

0

ファイルを表示

def read_pdf(filepath,
             pages: str = "1",
             password: str = '',
             flavor: str = "camelotPro",
             pro_kwargs: dict = None,
             suppress_stdout: bool = False,
             layout_kwargs: dict = None,
             **kwargs):
    """
    Read PDF and return extracted tables.
    Parameters described below are exclusive for CamelotPro.
    Please refer to the docstrings from Camelot.read_pdf for information on other parameters
    <https://github.com/atlanhq/camelot/blob/master/camelot/io.py#L9>

    Parameters
    ----------
    flavor : str (default: 'camelotPro') [Case-Insensitive]
        The parsing method to use ('lattice' or 'stream' or 'CamelotPro').
        'camelot_pro' and 'pro' are accepted as aliases of 'CamelotPro'.

    pro_kwargs: dict, Must Need (if flavor is "CamelotPro")
        A dict of (
            {
                "api_key": str,
                Mandatory, to trigger "CamelotPro" flavor, to process Scan PDFs and images, also text PDF files

                "job_id": str,
                    empty, to process a new file
                    Mandatory, to retrieve the result of the already submitted file

                "dup_check": bool, default: False - to bypass the duplicate check
                    Useful to handle duplicate requests, check based on the FileName

                "max_wait_time": int, default: 300
                    Checks for the output every 15 seconds until successfully processed or for a maximum of 300 seconds.
            }
        )
        The dict is only read, never modified.

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    IOError
        If a password is supplied together with the pro flavor.
    KeyError
        If the pro flavor is selected and ``pro_kwargs`` has no "api_key".
    """
    pro_flavors = ("camelotpro", "camelot_pro", "pro")

    if pro_kwargs is None:
        pro_kwargs = {}

    flavor = flavor.lower()
    if flavor in pro_flavors or any(
            kwa.lower() in pro_flavors for kwa in kwargs):
        # ``password`` is a named parameter, so it can never show up inside
        # ``kwargs``; the guard has to look at the parameter itself.
        if password:
            raise IOError(
                "Pro version does not support the password protected files")

        # Read with .get() so the caller's dict is left untouched.
        max_wait_time = int(pro_kwargs.get("max_wait_time", 300))
        dup_check = pro_kwargs.get("dup_check", False)

        et_sess = ExtractTable(api_key=pro_kwargs["api_key"])
        if not pro_kwargs.get("job_id", ""):
            # No job id: submit the file as a new job
            et_sess.process_file(filepath,
                                 pages=pages,
                                 output_format="df",
                                 dup_check=dup_check,
                                 max_wait_time=max_wait_time,
                                 library="camelotpro")
        else:
            # Job id given: fetch the result of an earlier submission
            et_sess.get_result(pro_kwargs["job_id"],
                               max_wait_time=max_wait_time)

        gp_resp = et_sess.ServerResponse.json()
        from camelot_pro.doppelganger import table_list
        tables = table_list(gp_resp)
    else:
        # Imported locally because this function shares the name read_pdf
        from camelot.io import read_pdf
        tables = read_pdf(filepath=filepath,
                          pages=pages,
                          password=password,
                          flavor=flavor,
                          suppress_stdout=suppress_stdout,
                          layout_kwargs=layout_kwargs if layout_kwargs else {},
                          **kwargs)
        if not tables:
            notify(try_pro)
    return tables

コード例 #8

0

ファイルを表示

def read_pdf(filepath,
             pages="1",
             password=None,
             flavor="lattice",
             suppress_stdout=False,
             layout_kwargs=None,
             pro_kwargs=None,
             **kwargs):
    """
    Read PDF and return extracted tables.
    Parameters described below are exclusive for CamelotPro.
    Please refer to the docstrings from Camelot.read_pdf for information on other parameters
    <https://github.com/atlanhq/camelot/blob/master/camelot/io.py#L9>

    Parameters
    ----------
    flavor : str (default: 'lattice') [Case-Insensitive]
        The parsing method to use ('lattice' or 'stream' or 'CamelotPro').

    layout_kwargs : dict, optional
        Passed through to camelot; an empty dict is used when omitted.

    pro_kwargs: dict, Must Need (if flavor is "CamelotPro")
        A dict of (
            {
                "api_key": str,
                Mandatory, to trigger "CamelotPro" flavor, to process Scan PDFs and images, also text PDF files

                "job_id": str,
                    optional, if processing a new file
                    Mandatory, to retrieve the result of already submitted file

                "dup_check": bool, default: False - to bypass the duplicate check
                    Useful to handle duplicate requests, check based on the FileName

                "wait_for_output": bool, default: True
                    While the job status starts with "process", poll for the
                    output every 20 seconds for a maximum of 300 seconds.
                        - Returns earlier as soon as the job leaves the processing state
                        - Alternatively, a big file process can always be tracked using the ".JobId" from the output
            }
        )
        The dict is only read, never modified.

    Returns
    -------
    tables : camelot.core.TableList
    """
    if pro_kwargs is None:
        pro_kwargs = {}
    # A fresh dict per call avoids sharing one mutable default across calls.
    if layout_kwargs is None:
        layout_kwargs = {}

    flavor = flavor.lower()
    if flavor == "camelotpro":
        from camelot_pro.gopro import GoPro
        from camelot_pro.doppelganger import table_list
        going_pro = GoPro(pro_kwargs.get("api_key", ""))
        gone_pro = going_pro.validate_api_key()
        if not pro_kwargs.get("job_id", ""):
            # No job id: submit the file as a new job
            gp_resp = gone_pro.trigger(filepath,
                                       pages,
                                       password=password,
                                       dup_check=pro_kwargs.get(
                                           "dup_check", False))
        else:
            # Job id given: fetch the result of an earlier submission
            gp_resp = gone_pro.get_tables(pro_kwargs["job_id"])

        # Added default wait time, because early users are confused of no output.
        # Kept in a local so the caller's dict is not modified.
        wait_for_output = pro_kwargs.get("wait_for_output", True)

        if wait_for_output:
            max_wait = 300
            check_freq = 20
            while max_wait > 0 and gp_resp["JobStatus"].lower().startswith(
                    "process"):
                print(
                    f'[Info]: Please wait, the Job is: {gp_resp["JobStatus"]} ..'
                )
                max_wait -= check_freq
                time.sleep(check_freq)
                gp_resp = gone_pro.get_tables(job_id=gp_resp["JobId"])
        tables = table_list(gp_resp)
    else:
        # Imported locally because this function shares the name read_pdf
        from camelot.io import read_pdf
        tables = read_pdf(filepath=filepath,
                          pages=pages,
                          password=password,
                          flavor=flavor,
                          suppress_stdout=suppress_stdout,
                          layout_kwargs=layout_kwargs,
                          **kwargs)
        if not tables:
            notify(try_pro)
    return tables

コード例 #9

0

ファイルを表示

def run_stream_parse():
    """Parse ``sample_pdf_file`` with the 'stream' flavor and print the result."""
    stream_tables = io.read_pdf(sample_pdf_file, flavor='stream')
    print(stream_tables)