def get_salary_from_pdf(file_path: str):
    return sum(
        convert_to_int(salary, file_path)
        for salary in camelot.read_pdf(file_path)[0].df[7][1:])
Example #2
def do_tablextract(self, g, pdf_path, p_num):  # g is globals
    print('Starting tablextract')
    camelot_method = 'lattice'  #stream/lattice

    if self.pdf_type == 'normal':
        print(pdf_path, p_num)
        if 'tabula' in g.text_pdf_method:
            tables = read_pdf(
                pdf_path,
                pages=[p_num],
                multiple_tables=True,
                java_options=
                '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider')
            for i in range(len(tables)):
                table_file_path = '%s/%s-%s' % (self.tables_folder_tabula,
                                                p_num, i)
                # tables[i].fillna('').to_html('%s.html' % (table_file_path))
                try:
                    tables[i].fillna('').to_csv('%s.csv' % (table_file_path),
                                                encoding='utf-8')
                except:
                    tables[i].fillna('').to_csv('%s.csv' % (table_file_path),
                                                encoding='cp1252')
        if 'camelot' in g.text_pdf_method:
            tables = camelot.read_pdf(pdf_path,
                                      flavor=camelot_method,
                                      pages=str(p_num))
            for i in range(len(tables)):
                # print(tables[0].parsing_report)
                table_file_path = '%s/%s-%s.csv' % (self.tables_folder_camelot,
                                                    p_num, i)
                tables[i].to_csv(table_file_path)  # one CSV per detected table

    else:
        if self.doc_type == 'image':
            # trying camelot
            print('Doing camelot-stream')
            camelot_method = 'stream'  #stream/lattice
            tables = camelot.read_pdf(pdf_path,
                                      flavor=camelot_method,
                                      pages=str(p_num))
            for i in range(len(tables)):
                # print(tables[0].parsing_report)
                table_file_path = '%s/%s-%s.csv' % (self.tables_folder_camelot,
                                                    p_num, i)
                tables[i].to_csv(table_file_path)  # one CSV per detected table

        # Trying pdftabextract
        filename = os.path.basename(pdf_path).split('.')[0].split('/')[0]
        DATAPATH = self.images_folder  # 'data/'
        INPUT_XML = '%s/%s.xml' % (self.images_folder, filename)
        os.system("pdftohtml -c -hidden -xml -enc UTF-8  -f %s -l %s %s %s" %
                  (p_num, p_num, pdf_path, INPUT_XML))
        # os.system("pdftohtml -c -hidden -f %s -l %s %s %s/%s.html" % (p_num, p_num, pdf_path, self.html_folder, filename))

        # Load the XML that was generated with pdftohtml
        xmltree, xmlroot = read_xml(INPUT_XML)
        # parse it and generate a dict of pages
        pages = parse_pages(xmlroot)
        # print(pages[p_num]['texts'][0])
        p = pages[p_num]

        # Detecting lines
        if self.doc_type == 'image':
            imgfilebasename = '%s-%s_1' % (filename, p_num)
            imgfile = self.file_path
        elif self.doc_type == 'pdf':
            imgfilebasename = '%s-%s_1' % (filename, p_num)
            imgfile = '%s/%s-%s_1.png' % (DATAPATH, filename, p_num)

        print("\npage %d: detecting lines in image file '%s'..." %
              (p_num, imgfile))

        # create an image processing object with the scanned page
        iproc_obj = imgproc.ImageProc(imgfile)

        # calculate the scaling of the image file in relation to the text boxes coordinate system dimensions
        page_scaling_x = iproc_obj.img_w / p['width']  # scaling in X-direction
        page_scaling_y = iproc_obj.img_h / p[
            'height']  # scaling in Y-direction

        # detect the lines
        lines_hough = iproc_obj.detect_lines(canny_kernel_size=3,
                                             canny_low_thresh=50,
                                             canny_high_thresh=150,
                                             hough_rho_res=1,
                                             hough_theta_res=np.pi / 500,
                                             hough_votes_thresh=round(
                                                 0.2 * iproc_obj.img_w))
        print("> found %d lines" % len(lines_hough))

        # helper function to save an image
        def save_image_w_lines(iproc_obj, imgfilebasename):
            img_lines = iproc_obj.draw_lines(orig_img_as_background=True)
            img_lines_file = os.path.join(
                self.temp_folder, '%s-lines-orig.png' % imgfilebasename)

            print("> saving image with detected lines to '%s'" %
                  img_lines_file)
            cv2.imwrite(img_lines_file, img_lines)

        save_image_w_lines(iproc_obj, imgfilebasename)

        # find rotation or skew
        # the parameters are:
        # 1. the minimum threshold in radians for a rotation to be counted as such
        # 2. the maximum threshold for the difference between horizontal and vertical line rotation (to detect skew)
        # 3. an optional threshold to filter out "stray" lines whose angle is too far apart from the median angle of
        #    all other lines that go in the same direction (no effect here)
        rot_or_skew_type, rot_or_skew_radians = iproc_obj.find_rotation_or_skew(
            radians(0.5),  # uses "lines_hough"
            radians(1),
            omit_on_rot_thresh=radians(0.5))

        # rotate back or deskew text boxes
        needs_fix = True
        if rot_or_skew_type == ROTATION:
            print("> rotating back by %f°" % -degrees(rot_or_skew_radians))
            rotate_textboxes(p, -rot_or_skew_radians, pt(0, 0))
        elif rot_or_skew_type in (SKEW_X, SKEW_Y):
            print("> deskewing in direction '%s' by %f°" %
                  (rot_or_skew_type, -degrees(rot_or_skew_radians)))
            deskew_textboxes(p, -rot_or_skew_radians, rot_or_skew_type,
                             pt(0, 0))
        else:
            needs_fix = False
            print("> no page rotation / skew found")

        if needs_fix:
            # rotate back or deskew detected lines
            lines_hough = iproc_obj.apply_found_rotation_or_skew(
                rot_or_skew_type, -rot_or_skew_radians)

            save_image_w_lines(iproc_obj, imgfilebasename + '-repaired')

        # save repaired XML (i.e. XML with deskewed textbox positions)

        repaired_xmlfile = os.path.join(self.temp_folder,
                                        filename + '.repaired.xml')

        print("saving repaired XML file to '%s'..." % repaired_xmlfile)
        xmltree.write(repaired_xmlfile)

        # Clustering vertical lines
        # cluster the detected *vertical* lines using find_clusters_1d_break_dist as simple clustering function
        # (break on distance MIN_COL_WIDTH/2)
        # additionally, remove all cluster sections that are considered empty
        # a cluster is considered empty when the number of text boxes in it is below 10% of the median number of text boxes
        # per cluster section
        MIN_COL_WIDTH = g.MIN_COL_WIDTH  # minimum width of a column in pixels, measured in the scanned pages
        vertical_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_VERTICAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p[
                'texts'],  # use this page's textboxes
            remove_empty_cluster_sections_n_texts_ratio=0.1,  # 10% rule
            remove_empty_cluster_sections_scaling=
            page_scaling_x,  # the positions are in "scanned image space" -> we scale them to "text box space"
            dist_thresh=MIN_COL_WIDTH / 2)
        print("> found %d clusters" % len(vertical_clusters))

        # draw the clusters
        img_w_clusters = iproc_obj.draw_line_clusters(
            imgproc.DIRECTION_VERTICAL, vertical_clusters)
        save_img_file = os.path.join(
            self.temp_folder, '%s-vertical-clusters.png' % imgfilebasename)
        print("> saving image with detected vertical clusters to '%s'" %
              save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters)

        # Clustering horizontal lines
        # cluster the detected *horizontal* lines using find_clusters_1d_break_dist as simple clustering function
        # (break on distance MIN_ROW_WIDTH/2)
        # additionally, remove all cluster sections that are considered empty
        # a cluster is considered empty when the number of text boxes in it is below 10% of the median number of text boxes
        # per cluster section
        MIN_ROW_WIDTH = g.MIN_ROW_WIDTH  # minimum width of a row in pixels, measured in the scanned pages
        horizontal_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_HORIZONTAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p[
                'texts'],  # use this page's textboxes
            remove_empty_cluster_sections_n_texts_ratio=0.1,  # 10% rule
            remove_empty_cluster_sections_scaling=
            page_scaling_y,  # the positions are in "scanned image space" -> we scale them to "text box space"
            dist_thresh=MIN_ROW_WIDTH / 2)
        print("> found %d clusters" % len(horizontal_clusters))

        # draw the clusters
        img_w_clusters_hoz = iproc_obj.draw_line_clusters(
            imgproc.DIRECTION_HORIZONTAL, horizontal_clusters)
        save_img_file = os.path.join(
            self.temp_folder, '%s-horizontal-clusters.png' % imgfilebasename)
        print("> saving image with detected vertical clusters to '%s'" %
              save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters_hoz)

        page_colpos = np.array(
            calc_cluster_centers_1d(vertical_clusters)) / page_scaling_x
        print('found %d column borders:' % len(page_colpos))
        print(page_colpos)

        page_rowpos = np.array(
            calc_cluster_centers_1d(horizontal_clusters)) / page_scaling_y
        print('found %d row borders:' % len(page_rowpos))
        print(page_rowpos)

        # right border of the second column
        col2_rightborder = page_colpos[2]

        # calculate median text box height
        median_text_height = np.median([t['height'] for t in p['texts']])

        # get all texts in the first two columns with a "usual" textbox height
        # we will only use these text boxes in order to determine the line positions because they are more "stable"
        # otherwise, especially the right side of the column header can lead to problems detecting the first table row
        text_height_deviation_thresh = median_text_height / 2
        texts_cols_1_2 = [
            t for t in p['texts'] if t['right'] <= col2_rightborder
            and abs(t['height'] -
                    median_text_height) <= text_height_deviation_thresh
        ]

        # get all textboxes' top and bottom border positions
        borders_y = border_positions_from_texts(texts_cols_1_2,
                                                DIRECTION_VERTICAL)

        # break into clusters using half of the median text height as break distance
        clusters_y = find_clusters_1d_break_dist(
            borders_y, dist_thresh=median_text_height / 2)
        clusters_w_vals = zip_clusters_and_values(clusters_y, borders_y)

        # for each cluster, calculate the median as center
        pos_y = calc_cluster_centers_1d(clusters_w_vals)
        pos_y.append(p['height'])

        print('number of line positions:', len(pos_y))

        pttrn_table_row_beginning = re.compile(
            r'^[\d Oo][\d Oo]{2,} +[A-ZÄÖÜ]')

        # 1. try to find the top row of the table
        texts_cols_1_2_per_line = split_texts_by_positions(
            texts_cols_1_2,
            pos_y,
            DIRECTION_VERTICAL,
            alignment='middle',
            enrich_with_positions=True)

        # go through the texts line per line
        for line_texts, (line_top, line_bottom) in texts_cols_1_2_per_line:
            line_str = join_texts(line_texts)
            if pttrn_table_row_beginning.match(
                    line_str
            ):  # check if the line content matches the given pattern
                top_y = line_top
                break
        else:
            top_y = 0

        print('Top_y: %s' % top_y)

        # hints for a footer text box
        words_in_footer = ('anzeige', 'annahme', 'ala')

        # 2. try to find the bottom row of the table
        min_footer_text_height = median_text_height * 1.5
        min_footer_y_pos = p['height'] * 0.7
        # get all texts in the lower 30% of the page that are at least 50% larger than the median textbox height
        bottom_texts = [
            t for t in p['texts'] if t['top'] >= min_footer_y_pos
            and t['height'] >= min_footer_text_height
        ]
        bottom_texts_per_line = split_texts_by_positions(
            bottom_texts,
            pos_y + [p['height']],  # always down to the end of the page
            DIRECTION_VERTICAL,
            alignment='middle',
            enrich_with_positions=True)
        # go through the texts at the bottom line per line
        page_span = page_colpos[-1] - page_colpos[0]
        min_footer_text_width = page_span * 0.8
        for line_texts, (line_top, line_bottom) in bottom_texts_per_line:
            line_str = join_texts(line_texts)
            has_wide_footer_text = any(t['width'] >= min_footer_text_width
                                       for t in line_texts)
            # check if there's at least one wide text or if all of the required words for a footer match
            if has_wide_footer_text or all_a_in_b(words_in_footer, line_str):
                bottom_y = line_top
                break
        else:
            bottom_y = p['height']

        print(bottom_y)
        print(pos_y)

        # finally filter the line positions so that only the lines between the table top and bottom are left
        print(page_rowpos)
        print("> page %d: %d lines between [%f, %f]" %
              (p_num, len(page_rowpos), top_y, bottom_y))

        def subsequent_pairs(l):
            """
            Return subsequent pairs of values in a list <l>, i.e. [(x1, x2), (x2, x3), (x3, x4), .. (xn-1, xn)] for a
            list [x1 .. xn]
            """
            return [(l[i - 1], v) for i, v in enumerate(l) if i > 0]

        # page_rowpos = [y for y in pos_y if top_y <= y <= bottom_y]
        print(page_colpos, page_rowpos)
        grid = make_grid_from_positions(page_colpos, page_rowpos)
        # print(grid)
        n_rows = len(grid)
        n_cols = len(grid[0])
        print("> page %d: grid with %d rows, %d columns" %
              (p_num, n_rows, n_cols))

        page_grids_file = os.path.join(self.temp_folder,
                                       filename + '_pagegrids.json')
        print("saving page grids JSON file to '%s'" % page_grids_file)
        save_page_grids({p_num: grid}, page_grids_file)

        datatable = fit_texts_into_grid(p['texts'], grid)
        df = datatable_to_dataframe(datatable)
        # print(df.head(n=2))

        csv_output_file = os.path.join(self.tables_folder, filename + '.csv')
        print("saving extracted data to '%s'" % csv_output_file)
        df.to_csv(csv_output_file, index=False, header=False)
Example #3
# http://theautomatic.net/2019/05/24/3-ways-to-scrape-tables-from-pdfs-with-python/
# https://www.thepythoncode.com/article/extract-pdf-tables-in-python-camelot
# https://www.java.com/en/download/help/windows_manual_download.html

import tabula
tables = tabula.read_pdf(file, pages = "all", multiple_tables = True)
print(tables)

exit()

# Read tables in PDF (Camelot)
# http://theautomatic.net/2019/05/24/3-ways-to-scrape-tables-from-pdfs-with-python/
# https://www.thepythoncode.com/article/extract-pdf-tables-in-python-camelot
import camelot

tables = camelot.read_pdf(file)
print(tables.n)

exit()

# Count pages in PDF (PyPDF2)
import PyPDF2

pdfFileObj = open(file, 'rb')

# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

# printing number of pages in pdf file
print(pdfReader.numPages)
Example #4
                "value": main_sum[6]
            },
        ],
    }],
}

# Get the link to the case status report
href = tag.find("a", text=re.compile("^県内の発生状況")).get("href")
link = pdf_link(urljoin(url, href))

# patients

# Read the PDF
tables = camelot.read_pdf(link,
                          pages="all",
                          split_text=True,
                          strip_text="\n",
                          line_scale=40)

df_tmp = pd.concat([table.df for table in tables]).reset_index(drop=True)

df_kanja = df_tmp.T.set_index(0).T

df_kanja.rename(columns={"NO.": "No"}, inplace=True)

df_kanja["No"] = df_kanja["No"].astype(int)
df_kanja.sort_values("No", inplace=True)

df_kanja["公表日"] = df_kanja["判明日"].apply(my_parser)

df_kanja["リリース日"] = df_kanja["公表日"].dt.strftime("%Y-%m-%dT08:00:00.000Z")
Example #5
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  7 09:38:29 2019

@author: ALarger

Cal Pesticide Residues 2001
"""

import camelot
import pandas as pd

chemName = []

tables = (camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\California\Cal Pesticide Residues\document_1359549.pdf',
    pages='2296-2754',
    flavor='stream'))

for table in tables:
    df = table.df
    chemName.extend(df.iloc[:, 0])

m = len(chemName)
while m > 0:  #Go backwards through chemical name list so that the indexing does not get messed up when one is deleted
    m -= 1
    chemName[m] = chemName[m].strip()
    if chemName[m] == '' or chemName[m] == 'NO RESIDUE FOUND' or chemName[
            m] == 'CHEMNAME':
        del chemName[m]

nIngredients = len(chemName)
Example #6
def test_lattice_process_background():
    df = pd.DataFrame(data_lattice_process_background)

    filename = os.path.join(testdir, "background_lines_1.pdf")
    tables = camelot.read_pdf(filename, process_background=True)
    assert df.equals(tables[1].df)
Example #7
def test_arabic():
    df = pd.DataFrame(data_arabic)

    filename = os.path.join(testdir, "tabula/arabic.pdf")
    tables = camelot.read_pdf(filename)
    assert df.equals(tables[0].df)
Example #8
def parse(url):
    tables = camelot.read_pdf(url, pages="all")
    surv, chro, dist = 0, [], []
    for table in tables:
        if len(table.df.columns) in [2, 3]:
            dist.append(table.df)
        if len(table.df.columns) in [4, 5]:
            if table.df[len(table.df.columns) - 1][0] == "Remarks":
                chro.append(table.df)
            if table.df[0][0] == "District":
                surv = table.df
    data = init_data()
    num, rem, dis = "", "", ""

    # manual fixes
    # for bule_25032020.pdf
    if "bule_25032020" in url:
        chro[0][2][4] = "Pathanamthitta – 4\nKottayam – 2 \nErnakulam -2"
    # for bule_20032020.pdf
    if "bule_20032020" in url:
        chro[0][1][9] = "Thiruvananthapuram -3"
        chro[0][1][10] = "Thiruvananthapuram -1"

    # Parses table: Chronology of Positive cases
    i = 1
    if "patient" in chro[0][0][0]:
        i = 0

    def dis_parse(s):
        return list(
            map(
                re.compile("[-–]").split,
                re.sub(
                    "[\(\[].*?[\)\]]",
                    "",
                    s.replace(" ", "").replace(",", ""),
                    flags=re.DOTALL,
                ).splitlines(),
            ))

    def add(n):
        data[t[0]]["corona_positive"] += n
        if "Negative" in rem:
            data[t[0]]["cured_discharged"] += n
        if "Expired" in status:
            data[t[0]]["deaths"] += n

    for ch in chro:
        for row in ch.iterrows():
            if "persons have been" in row[1][0]:
                continue
            if "patient" in row[1][i]:
                continue
            if row[1][i].isnumeric():
                num = int(row[1][i])
            if row[1][i + 1]:
                dis = dis_parse(row[1][i + 1])
            else:
                continue
            if row[1][i + 2]:
                rem = row[1][i + 2]
            status = row[1][i + 3]
            if len(dis) > 1:
                if len(dis[0]) > 1:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(int(t[1]))
                else:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(1)
            else:
                if len(dis[0]) > 1:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(int(t[1]))

                else:
                    t = dis[0]
                    t[0] = check_alt(t[0])
                    inc = 1
                    if row[1][i]:
                        inc = int(num)
                    add(inc)

    # Parses table: Details of persons under Surveillance
    for row in surv.iterrows():
        if any(x in row[1][0] for x in ["District", "Total"]):
            continue
        t = check_alt(row[1][0].strip())
        data[t]["under_observation"] += int(row[1][1])
        data[t]["under_home_isolation"] += int(row[1][2])
        data[t]["total_hospitalised"] += int(row[1][3])
        data[t]["hospitalised_today"] += int(row[1][4])

    # Parses table: District wise distribution based on hospital admission
    for di in dist:
        for row in di.iterrows():
            if any(x in row[1][0] for x in ["District", "Total"]):
                continue
            t = check_alt(row[1][0].strip())
            data[t]["positive_admitted"] += int(row[1][1])

    data = {
        "kerala": data,
        "time": datetime.now().isoformat(),
        "file_url": url,
    }
    return json.dumps(data)
Example #9
@author: ALarger
"""

import camelot, re, csv
import pandas as pd

chemName = []
templateName = []
funcUse = []
prodID = []
chem = ''

#Fruits and Veggies
tables = camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\USDA\Pesticide Annual Summary 2002\document_1363480.pdf',
    pages='52-94',
    flavor='stream')
for table in tables:
    df = table.df
    for index, row in df.iterrows():
        if row[1] == '' and row[0] != '':
            chem = row[0].split('(')[0].strip()
            use = ', '.join(re.findall(r'\(.*?\)', row[0])).replace(
                '(', '').replace(')', '')
        elif row[2] != '0' and row[2] != '' and all(
                c in '1234567890 '
                for c in row[2]) and 'TOTAL' not in row[0]:
            commodity = row[0].split('(')[0].strip().replace(
                '/', ' and ')  # Can't have / in filename
            chemName.append(chem)
Example #10
chemName = []
casN = []
prodID = []
templateName = []
msdsDate = []
recUse = []
catCode = []
descrip = []
code = []
sourceType = []
propName = []
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))

tables = camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\FDA\CDER Drug and Biologic Approvals 2010\CDER New Drug Approvals 2010.pdf',
    pages='all',
    flavor='lattice')
i = 0
for table in tables:
    df = tables[i].df
    if i == (len(tables) - 1):
        chemName.extend(df.iloc[:, 2])
        casN.extend([''] * len(df))
        propName.extend(df.iloc[:, 1])
        prodID.extend(['1372142'] * len(df))
        templateName.extend(['CDER Biologic License Approvals 2010.pdf'] *
                            len(df))
        msdsDate.extend([2010] * len(df))
        recUse.extend([''] * len(df))
        catCode.extend([''] * len(df))
        descrip.extend([''] * len(df))
Example #11
import camelot
import pandas as pd

chemName = []
casN = []
prodID = []
templateName = []
msdsDate = []
recUse = []
catCode = []
descrip = []
code = []
sourceType = []

tables = (camelot.read_pdf(
    r'L:/Lab/HEM/ALarger/Actor Automated Extraction/Arizona/(2003) Pesticide Contamination Prevention Program Report A.R.S. 49-303.B/document_320432.pdf',
    pages='3-18',
    flavor='lattice'))
i = 0
for table in tables:
    df = tables[i].df
    if i <= 6:  #Table 2
        df = df.drop(df.index[0])
        df = df.drop(df.index[0])
        chemName.extend(df.loc[:, 1])
        casN.extend(df.loc[:, 0])
        prodID.extend(['1372080'] * len(df))
        templateName.extend(['Arizona 2003 Pesticide Report Table 2.pdf'] *
                            len(df))
        msdsDate.extend([2003] * len(df))
        recUse.extend([''] * len(df))
        catCode.extend([''] * len(df))
Example #12
    def pdf_to_table(self, path):

        tables = camelot.read_pdf(path, pages="all")

        return tables
Example #13
import camelot

tables = camelot.read_pdf("Sample SLDC file.pdf")
first_table = tables[0].df
print(first_table)
Example #14
def read_pdf(file_path):
    tables = camelot.read_pdf(file_path, pages="1-end")
    for table in tables:
        yield table
Example #15
def test_stream_edge_tol():
    df = pd.DataFrame(data_stream_edge_tol)

    filename = os.path.join(testdir, "edge_tol.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
    assert df.equals(tables[0].df)
Example #16
def main():
    pdf_temp = '/tmp/covid.tmp.pdf'
    header = ('caso', 'estado', 'sexo', 'edad', 'sintomas', 'rt-pcr',
              'procedencia', 'llegada')

    if len(sys.argv) == 2:
        url = sys.argv[1]
        tld = extract(url)

        if tld.suffix not in ['gob.mx']:
            sys.stdout.write(
                'Error: %sInvalid URL. URL should be from gob.mx domain%s.\n' %
                (RED, RESET))
            sys.exit(1)

        try:
            result = search(r"([0-9]{4}\.[0-9]{2}\.[0-9]{2})", url)
            report_date = datetime.strptime(result[0], '%Y.%m.%d')
        except:
            sys.stdout.write(
                'Error: %sInvalid format for report date URL. Format should contain date YYYY.MM.DD%s .\n'
                % (RED, RESET))
            sys.exit(1)

        if 'sospechosos' in url:
            report_type = 's'
        elif 'positivos' in url:
            report_type = 'c'

        json_filename = 'json/%s-%02d-%02d-%s.json' % (
            report_date.year, report_date.month, report_date.day, report_type)

        sys.stdout.write('Downloading PDF from: %s \n' % url)

        try:
            file = wget.download(url, pdf_temp)
        except:
            sys.stdout.write('%sError%s\n' % (RED, RESET))
            sys.exit(1)

        # Overwrite file if already exists
        if os.path.exists(pdf_temp):
            shutil.move(file, pdf_temp)

        sys.stdout.write(
            ' [%sDONE%s]\nPerforming PDF conversion (this may take a while): '
            % (GREEN, RESET))
        start = time.time()

        try:
            tables = camelot.read_pdf(pdf_temp, parallel=True, pages='all')
        except:
            sys.stdout.write(
                '%sInvalid format. Downloaded file should be a PDF%s.\n' %
                (RED, RESET))
            sys.exit(1)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)
        sys.stdout.write(' %02dh %02dm %02.2fs [%sDONE%s]\n' %
                         (int(hours), int(minutes), seconds, GREEN, RESET))

        json_file = open(json_filename, 'w')
        json_file.write('{"datos":[')
        first_line = True
        first_row = True
        counter = 1
        sys.stdout.write('Creating JSON file: ')

        for table in tables:
            for row in table.data:
                if first_line is False:
                    if first_row is False:
                        json_file.write(',\n')

                    data = dict(zip(header, row))

                    # Clean Strings
                    data['estado'] = data['estado'].replace('*', '')
                    data['estado'] = data['estado'].replace('\n', '')
                    sys.stdout.write('%s.%s' % (BOLD, RESET))
                    json.dump(data, json_file, ensure_ascii=False, indent=3)
                    first_row = False

                first_line = False
                counter += 1

        json_file.write(']}')
        sys.stdout.write(' [%sDONE%s]\n' % (GREEN, RESET))
        json_file.close()

        # JSON validation
        sys.stdout.write('Validating JSON: ')
        json_file = open(json_filename, 'r')
        json_data = json_file.read()

        try:
            json.loads(json_data)
            sys.stdout.write('%sfile is valid%s.\n' % (GREEN, RESET))
            sys.stdout.write('\n')
            sys.stdout.write('File %s%s%s created successfully.\n' %
                             (BOLD, json_filename, RESET))
            sys.stdout.write('%s%02d%s cases processed from %s%d%s pages.\n' %
                             (BOLD, counter, RESET, BOLD, len(tables), RESET))
        except ValueError as error:
            sys.stdout.write(
                '\033[1;31mJSON is invalid, check for errors\033[0;0m.\n')

        json_file.close()
        sys.stdout.write('\n')
        sys.exit(0)
    else:
        sys.stdout.write('%sError: report URL is required%s.\n' % (RED, RESET))
        sys.stdout.write('Usage: %spython pdf-importer.py <URL>%s.\n' %
                         (CYAN, RESET))
        sys.exit(1)
Example #17
def test_lattice_table_areas():
    df = pd.DataFrame(data_lattice_table_areas)

    filename = os.path.join(testdir, "twotables_2.pdf")
    tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
    assert df.equals(tables[0].df)
Example #18
    def parse_pdf(file_path):
        tables = Wrangler.__define_table_configs()
        date_regex = re.compile(r"(\d{1,2}\/){2}\d{4}")
        notes_regex = re.compile(r"notes:?", flags=re.IGNORECASE)

        print("Processing PDF...")
        parsed_tables = camelot.read_pdf(filepath=file_path, pages='1-end')
        for i, tbl in enumerate(parsed_tables):
            print(f"\nParsing page {i+1}.")
            df = tbl.df
            # this following line is mainly for debugging purposes to see if table start/end matches index
            # it will export these csv files to the project root directory
            # df.to_csv(f"parsed-table-{i}.csv")
            column_a = df.iloc[:, 0]

            cache = {'last': None, 'history': []}
            for i, cell in column_a.items():
                cell = cell.strip().lower()

                # basic logic for parsing one of these tables is as follows...
                # step 1: find a matching table name, save to cache['last']
                # step 2: find the next date cell, save start row as i
                # step 3: find the 'Notes:' cell, save end row as i-1 (blank row)
                # step 4: empty cache['last'], return to step 1

                if cache['last']:
                    if tables[cache['last']].start:
                        if notes_regex.search(cell):
                            print(f"  > last row for \'{cache['last']}\': {i}")
                            tables[cache['last']].end = i-1
                            cache['history'].append(cache['last'])
                            cache['last'] = None
                        else:
                            continue
                    else:
                        if date_regex.search(cell):
                            print(f"  > first row for \'{cache['last']}\': {i}")
                            tables[cache['last']].start = i
                        else:
                            continue
                else:
                    if cell in tables:
                        print(f"Found \'{cell}\' table!")
                        cache['last'] = cell
                    else:
                        continue

            # now cache['history'] is a list of str table names that we parsed from this tbl
            print("\nBuilding DataFrames...")
            for name in cache['history']:
                start_row = tables[name].start
                end_row = tables[name].end
                print(f"  > \'{name}\' rows: {start_row} to {end_row}")

                tables[name].df = df.iloc[start_row:end_row, :].copy()
                t_df = tables[name].df
                t_df.replace(to_replace=r"^\s*$", value=np.nan, regex=True, inplace=True)
                t_df.dropna(axis=0, how='all', inplace=True)
                t_df.dropna(axis=1, how='all', inplace=True)
                t_df.replace(to_replace=r"\n", value=" ", regex=True, inplace=True)

                num_parsed_cols = t_df.shape[1]
                expected_cols = tables[name].cols
                if num_parsed_cols == len(expected_cols):
                    t_df.columns = expected_cols
                else:
                    # TODO: if it doesn't match, raise a warning and skip it for now? (ask Roberto)
                    print(f"WARNING: Column length mismatch! # of parsed cols: {num_parsed_cols}, # of expected cols: {len(expected_cols)}")
            continue

        print("\nFinished parsing!")
        for table in tables:
            print(f"\n\'{table}\' DataFrame:\n{tables[table].df.head()}")

        return tables
Example #19
def test_lattice_copy_text():
    df = pd.DataFrame(data_lattice_copy_text)

    filename = os.path.join(testdir, "row_span_1.pdf")
    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
    assert df.equals(tables[0].df)
Example #20
import camelot
import numpy as np
import os
import pandas as pd
import PyPDF4
import re
import io
import time

os.chdir(r"C:\Users\jdavi\Downloads")

# Tables in digital document

# It is crucial to specify the page(s) the table is on.
# Loop over each page to make sure every table gets parsed
# (a per-page loop sketch follows below).
tables = camelot.read_pdf('C_PROCESO_20-12-10643602_205031011_72747497.pdf',
                          pages='1,2',
                          flavor='lattice')  # flavor='lattice'

tables1 = camelot.read_pdf('C_PROCESO_20-12-10658075_205142011_72882086.pdf',
                           pages='2')

# tables = camelot.read_pdf('C_PROCESO_20-12-10643602_205031011_72747497.pdf', pages='2')

tables[0].df
tables1[1].df

tables.export('secop_test1.csv', f='csv', compress=True)
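# A minimal per-page loop along the lines of the comment above (a sketch only:
# it reuses the same sample file and uses PyPDF4, imported above, just to get
# the page count).
reader = PyPDF4.PdfFileReader('C_PROCESO_20-12-10643602_205031011_72747497.pdf')
for page_number in range(1, reader.getNumPages() + 1):
    # camelot page numbers are 1-based strings
    page_tables = camelot.read_pdf(
        'C_PROCESO_20-12-10643602_205031011_72747497.pdf',
        pages=str(page_number),
        flavor='lattice')
    print(page_number, page_tables.n)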

# Tables in scanned document

import tabula  # Works better pip install tabula-py
Example #21
def test_stream():
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")
    assert df.equals(tables[0].df)
Example #22
def getPDF():
    baseUrl = 'https://www.winthrop.edu/uploadedFiles/Police/DailyCaseLog-'
    today = date.today()
    formatted_date = today.strftime("%B%Y")
    extension = ".pdf"
    monthext = formatted_date + extension
    url = baseUrl + monthext
    with open('/tmp/pol.pdf', 'wb') as f:
        f.write(requests.get(url).content)
    pdfFile = '/tmp/pol.pdf'
    return pdfFile


file = getPDF()
tables = camelot.read_pdf('/tmp/pol.pdf', pages='all', strip_text=' .\n')
for i in range(len(tables)):
    dateTime = tables[i].df.loc[4, 0]
    dateTimeList = dateTime.partition('T')
    time = dateTimeList[1] + dateTimeList[2]
    synopsis = tables[i].df.loc[10, 0].split(":")[1]
    print(synopsis)
    #if(dateTimeList[i].split(":")[1]!=''):
    #    print(dateTimeList[0])
    #    print("")
    #    print(time)
    #    print("")
    #    synopsis = tables[i].df.loc[10,0]
    #    print(synopsis.split(": ")[1])
    #    print("")
Example #23
import camelot
import re
import datetime
import matplotlib.pyplot as plt
from deltaCalculator import DeltaCalculator

deltaCalculator = DeltaCalculator(True)
startPid = input("Enter start page number:")
endPid = input("Enter end page number:")
pages = ""

for i in range(int(startPid), int(endPid) + 1):
	pages = pages + "," + str(i) if len(pages) != 0 else str(i)
print(pages)

tables = camelot.read_pdf('.tmp/ka.pdf',strip_text='\n', pages=pages, split_text = True)

for index, table in enumerate(tables):
	tables[index].to_csv('.tmp/ka' + str(index) + '.csv')

kaOutputFile = open('kafull.csv', 'w') 
for index, table in enumerate(tables):
	kaFile = open('.tmp/ka' + str(index) + '.csv', 'r') 
	lines = kaFile.readlines()

	for line in lines:
		line = line.replace('\"', '')
		linesArray = line.split(',')
		if len(linesArray[7]) == 0:
			continue
Example #24
import camelot

foo = camelot.read_pdf('SchuldnerAtlas_Deutschland_2019_Kreise_Alphabet.pdf',pages='1,2,3,4,5,6,7,8,9,10,11')
foo.export('foo.csv', f='csv') 
Example #25
import os
import re
import ntpath
import camelot
import pandas as pd
from glob import glob
import functions as fun

(USR, OVW) = ('srv', False)
(PATH_I, PATH_O ) = fun.selectPaths(USR)
# Cycle through the PDFs -------------------------------------------------------
fPaths = glob(PATH_I+'*.pdf')
(fNum, fNames) = (len(fPaths), fun.stripPaths(fPaths))
print('- Storing files to {}'.format(PATH_O))
for (i, fPath) in enumerate(fPaths):
    print('\t* ({}/{}) Processing {}'.format(i+1, fNum, fPath))
    # Check if the file already exists -----------------------------------------
    outFPath = PATH_O+fNames[i]+'.csv'
    outFExists = os.path.exists(outFPath)
    # Load file and parse ------------------------------------------------------
    if (not outFExists or OVW):
        # Cleanup the table ----------------------------------------------------
        tables = camelot.read_pdf(fPath, pages='1-end')
        dfs = [fun.cleanupDF(table) for table in tables]
        # Merge dataframes -----------------------------------------------------
        dfPre = pd.concat(dfs).dropna()
        dfPst = dfPre.applymap(fun.cleanCell)
        dfPst.to_csv(outFPath)
print('- Finished!')
Example #26
def test_stream_flag_size():
    df = pd.DataFrame(data_stream_flag_size)

    filename = os.path.join(testdir, "superscript.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
    assert df.equals(tables[0].df)
Example #27
def parse(input, pdf):
    print('---Parsing detected tables----')
    tables = camelot.read_pdf(pdf, pages=','.join(map(str, input)))
    tables.export('output.html', f='html', compress=True)
    #tables[0].to_csv('output.csv')
    print('---RAR File Generated---')
Example #28
def test_stream_strip_text():
    df = pd.DataFrame(data_stream_strip_text)

    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
    assert df.equals(tables[0].df)
Example #29
def pdf_to_pandas_clear(path):
    '''
    Parameters
    ----------
    path : complete path of the folder where your PDFs are stored,
            example: "C:\\Users\\exemple\\Downloads"
    Returns
    -------
    notas : pandas dataframe with the columns 'Negociação',
    'Compra/Venda', 'Tipo de Mercado', 'Especificação do título', 'Quantidade', 'Preço', 'Valor',
    'Débito/Crédito', 'Data' and 'Código', well formatted
    '''

    arquivos_path = listdir(path)

    notas_path = []
    for i in range(len(arquivos_path)):
        if 'Nota' in arquivos_path[i]:
            notas_path.append(arquivos_path[i])

    notas = pd.DataFrame()
    for i in notas_path:
        path_pdf = r'{}\{}'.format(path, i)

        #Using the lib camelot to select tables in a pdf
        tables = camelot.read_pdf(
            path_pdf,
            flavor='stream',
            table_areas=['0,600,600,400'],
            columns=['91,105,167,180,305,345,402,445,543'])

        ######## IF ERROR "EOF marker not found":
        # That's a strange error that usually happens with Clear PDFs.
        # It's caused by an error made when the PDF was generated.
        ##### SOLVING ######
        # I normally open the PDF file and make a small edit:
        # for example, I open the PDF in Microsoft Edge, where it is possible
        # to draw with the mouse, and make a small dot at the top of the page.
        # That way the PDF is saved again in a proper format (with an EOF marker).
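        # A programmatic alternative, as a rough sketch only (not part of the
        # original workflow): re-saving the file with pikepdf -- assuming the
        # pikepdf package is available -- can often rebuild a missing EOF marker.
        # import pikepdf
        # with pikepdf.open(path_pdf) as repaired:
        #     repaired.save(path_pdf + '.fixed.pdf')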

        #Selecting the first table detected in the given area
        notas_de_corretagem = tables[0].df

        #Making the first row the header
        notas_de_corretagem.columns = notas_de_corretagem.loc[0]

        #Excluding the first row (now it is the header) and resetting the index
        notas_de_corretagem2 = notas_de_corretagem.drop(
            [0], axis=0).reset_index(drop=True)

        #Fixing the columns (named to match the note layout and the docstring,
        #so the 'Prazo'/'Obs' drop and the later column lookups work)
        notas_de_corretagem2.columns = [
            'Negociação', 'Compra/Venda', 'Tipo de Mercado', 'Prazo',
            'Especificação do título', 'Obs', 'Quantidade', 'Preço', 'Valor',
            'Débito/Crédito'
        ]

        notas_de_corretagem3 = notas_de_corretagem2.drop(
            ['Prazo', 'Obs'], axis=1).reset_index(drop=True)

        # Fixing date
        tables1 = camelot.read_pdf(path_pdf, flavor='stream')
        df_date = tables1[0].df
        date_wrong = df_date.iloc[2, 2]
        year = int(date_wrong[-4:])
        month = int(date_wrong[-7:-5])
        day = int(date_wrong[-10:-8])
        date = datetime.datetime(year, month, day)

        #Adding date to the main table
        notas_de_corretagem4 = notas_de_corretagem3.assign(
            Data=[date] * len(notas_de_corretagem2))

        #transforming 'V' to 'Venda' and 'C' to 'Compra'
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Compra/Venda']
            if y == 'C':
                w = 'Compra'
            elif y == 'V':
                w = 'Venda'
            else:
                w = 'Error'
            x = np.append(x, w)
        notas_de_corretagem4['Compra/Venda'] = x

        #transforming 'D' to 'Débito' and 'C' to 'Crédito'
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Débito/Crédito']
            if y == 'D':
                w = 'Débito'
            elif y == 'C':
                w = 'Crédito'
            else:
                w = 'Error'
            x = np.append(x, w)
        notas_de_corretagem4['Débito/Crédito'] = x

        #Converting 'Preço / Ajuste' into a numerical value (float)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Preço']
            # Here we replace the decimal ',' with '.' and drop the thousands separator '.'
            local = y.rfind(',')
            z = y[:local] + '.' + y[local + 1:]
            if len(z) > 6:
                local2 = z.find('.')
                t = float(z[:local2] + '' + z[local2 + 1:])
            else:
                t = float(z)
            x = np.append(x, t)
        notas_de_corretagem4['Preço'] = x

        #Converting 'Quantidade' into a numerical value (int)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = int(notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                             'Quantidade'])
            x = np.append(x, y)
        notas_de_corretagem4['Quantidade'] = x

        #Calculating 'Valor / Ajuste' as numerical value (float)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = np.round(
                notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Preço'] *
                notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Quantidade'], 2)
            x = np.append(x, y)
        notas_de_corretagem4['Valor'] = x

        #Converting the enterprise name ('Especificação do título') to the stock code ('Código')

        #Selecting only the main name of the good (ticker)

        on_pn = 0
        nome = 0
        codigo = []
        tipo = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i],
                                         'Especificação do título']
            w = y.split('          ')
            if len(w) == 3:
                w = w[:-1]

            nome = w[0]
            on_pn = w[1]

            cod = ''
            tipo_papel = ''
            for j in np.arange(len(Lista_empresas)):
                if nome == Lista_empresas['Nome de Pregão'][j]:
                    cod = str(Lista_empresas['Código'][j])[:4]
                    if "ON" in on_pn:
                        cod = f'{cod}{3}'
                        tipo_papel = 'Ação'
                    elif "PNA" in on_pn:
                        cod = f'{cod}{5}'
                        tipo_papel = 'Ação'
                    elif "PNB" in on_pn:
                        cod = f'{cod}{6}'
                        tipo_papel = 'Ação'
                    elif "PNC" in on_pn:
                        cod = f'{cod}{7}'
                        tipo_papel = 'Ação'
                    elif "PND" in on_pn:
                        cod = f'{cod}{8}'
                        tipo_papel = 'Ação'
                    elif "UNT " in on_pn:
                        cod = f'{cod}{11}'
                        tipo_papel = 'Ação'
                    elif "PN" in on_pn:
                        cod = f'{cod}{4}'
                        tipo_papel = 'Ação'
                    elif "CI" in on_pn:
                        cod = f'{cod}{11}'
                        tipo_papel = 'ETF'

            if cod == '':
                cod = on_pn
                tipo_papel = 'FII'
            codigo.append(cod)
            tipo.append(tipo_papel)

        notas_de_corretagem4['Código'] = codigo
        notas_de_corretagem4['Tipo Papel'] = tipo

        notas = pd.concat([notas, notas_de_corretagem4]).reset_index(drop=True)

    return notas
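
# A minimal usage sketch for the function above; the folder path is just the
# docstring's own example path and the variable name is illustrative.
notas_exemplo = pdf_to_pandas_clear(r"C:\Users\exemple\Downloads")
print(notas_exemplo.head())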
Example #30
from pathlib import Path

import camelot
from tabulate import tabulate
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1

file_path = Path(Path.cwd(), "image.pdf")

file = open(str(file_path), 'rb')
parser = PDFParser(file)
document = PDFDocument(parser)

# total number of pages, resolved from the document catalog
num_pages = resolve1(document.catalog['Pages'])['Count']

tables_list = []

tables = camelot.read_pdf(str(file_path), pages="all")


for i in range(1, num_pages + 1):  # camelot page numbers are 1-based
    print (i)
    tables = camelot.read_pdf("image.pdf", pages='%d' %  i)
    try:
        print (tabulate(tables[0].df))
        print (tabulate(tables[1].df))
    except IndexError:
        print('None')
        
        

tables.export('image.csv', f='csv', compress=False)    
tables[0].parsing_report