import camelot


def get_salary_from_pdf(file_path: str):
    # Sum the salary column (column 7, skipping the header row)
    # of the first table detected in the PDF.
    tables = camelot.read_pdf(file_path)
    return sum(
        convert_to_int(salary, file_path)
        for salary in tables[0].df[7][1:])
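# convert_to_int above is an external helper that is not shown in this
# snippet; a minimal, hypothetical sketch of what it might do (the real
# project may clean cells or report errors differently):
def convert_to_int(value: str, file_path: str) -> int:
    """Parse a table cell such as '1,234 ' into an int, flagging bad cells."""
    cleaned = value.strip().replace(',', '')
    try:
        return int(cleaned)
    except ValueError:
        print('could not parse %r in %s' % (value, file_path))
        return 0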
def do_tablextract(self, g, pdf_path, p_num):  # g is globals
    print('Starting tablextract')
    camelot_method = 'lattice'  # stream/lattice
    if self.pdf_type == 'normal':
        print(pdf_path, p_num)
        if 'tabula' in g.text_pdf_method:
            tables = read_pdf(
                pdf_path,
                pages=[p_num],
                multiple_tables=True,
                java_options='-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider')
            for i in range(len(tables)):
                table_file_path = '%s/%s-%s' % (self.tables_folder_tabula, p_num, i)
                # tables[i].fillna('').to_html('%s.html' % (table_file_path))
                try:
                    tables[i].fillna('').to_csv('%s.csv' % (table_file_path), encoding='utf-8')
                except Exception:
                    tables[i].fillna('').to_csv('%s.csv' % (table_file_path), encoding='cp1252')
        if 'camelot' in g.text_pdf_method:
            tables = camelot.read_pdf(pdf_path, flavor=camelot_method, pages=str(p_num))
            for i in range(len(tables)):
                # print(tables[0].parsing_report)
                table_file_path = '%s/%s-%s.csv' % (self.tables_folder_camelot, p_num, i)
                tables.export(table_file_path, f='csv', compress=False)
    else:
        if self.doc_type == 'image':
            # trying camelot
            print('Doing camelot-stream')
            camelot_method = 'stream'  # stream/lattice
            tables = camelot.read_pdf(pdf_path, flavor=camelot_method, pages=str(p_num))
            for i in range(len(tables)):
                # print(tables[0].parsing_report)
                table_file_path = '%s/%s-%s.csv' % (self.tables_folder_camelot, p_num, i)
                tables.export(table_file_path, f='csv', compress=False)

        # Trying pdftabextract
        filename = os.path.basename(pdf_path).split('.')[0].split('/')[0]
        DATAPATH = self.images_folder  # 'data/'
        INPUT_XML = '%s/%s.xml' % (self.images_folder, filename)
        os.system("pdftohtml -c -hidden -xml -enc UTF-8 -f %s -l %s %s %s" %
                  (p_num, p_num, pdf_path, INPUT_XML))
        # os.system("pdftohtml -c -hidden -f %s -l %s %s %s/%s.html" % (p_num, p_num, pdf_path, self.html_folder, filename))

        # Load the XML that was generated with pdftohtml
        xmltree, xmlroot = read_xml(INPUT_XML)
        # parse it and generate a dict of pages
        pages = parse_pages(xmlroot)
        # print(pages[p_num]['texts'][0])
        p = pages[p_num]

        # Detecting lines
        if self.doc_type == 'image':
            imgfilebasename = '%s-%s_1' % (filename, p_num)
            imgfile = self.file_path
        elif self.doc_type == 'pdf':
            try:
                imgfilebasename = '%s-%s_1' % (filename, p_num)
                imgfile = '%s/%s-%s_1.png' % (DATAPATH, filename, p_num)
            except Exception:
                imgfilebasename = filename + str(p_num)
                imgfile = '%s/%s-%s_1.png' % (DATAPATH, filename, p_num)
        print("\npage %d: detecting lines in image file '%s'..." % (p_num, imgfile))

        # create an image processing object with the scanned page
        iproc_obj = imgproc.ImageProc(imgfile)

        # calculate the scaling of the image file in relation to the text boxes' coordinate system dimensions
        page_scaling_x = iproc_obj.img_w / p['width']    # scaling in X direction
        page_scaling_y = iproc_obj.img_h / p['height']   # scaling in Y direction

        # detect the lines
        lines_hough = iproc_obj.detect_lines(
            canny_kernel_size=3,
            canny_low_thresh=50,
            canny_high_thresh=150,
            hough_rho_res=1,
            hough_theta_res=np.pi / 500,
            hough_votes_thresh=round(0.2 * iproc_obj.img_w))
        print("> found %d lines" % len(lines_hough))

        # helper function to save an image
        def save_image_w_lines(iproc_obj, imgfilebasename):
            img_lines = iproc_obj.draw_lines(orig_img_as_background=True)
            img_lines_file = os.path.join(self.temp_folder, '%s-lines-orig.png' % imgfilebasename)
            print("> saving image with detected lines to '%s'" % img_lines_file)
            cv2.imwrite(img_lines_file, img_lines)

        save_image_w_lines(iproc_obj, imgfilebasename)

        # find rotation or skew; the parameters are:
        # 1. the minimum threshold in radians for a rotation to be counted as such
        # 2. the maximum threshold for the difference between horizontal and vertical line rotation (to detect skew)
        # 3. an optional threshold to filter out "stray" lines whose angle is too far apart from the median angle of
        #    all other lines that go in the same direction (no effect here)
        rot_or_skew_type, rot_or_skew_radians = iproc_obj.find_rotation_or_skew(
            radians(0.5),  # uses "lines_hough"
            radians(1),
            omit_on_rot_thresh=radians(0.5))

        # rotate back or deskew text boxes
        needs_fix = True
        if rot_or_skew_type == ROTATION:
            print("> rotating back by %f°" % -degrees(rot_or_skew_radians))
            rotate_textboxes(p, -rot_or_skew_radians, pt(0, 0))
        elif rot_or_skew_type in (SKEW_X, SKEW_Y):
            print("> deskewing in direction '%s' by %f°" % (rot_or_skew_type, -degrees(rot_or_skew_radians)))
            deskew_textboxes(p, -rot_or_skew_radians, rot_or_skew_type, pt(0, 0))
        else:
            needs_fix = False
            print("> no page rotation / skew found")

        if needs_fix:
            # rotate back or deskew detected lines
            lines_hough = iproc_obj.apply_found_rotation_or_skew(rot_or_skew_type, -rot_or_skew_radians)
            save_image_w_lines(iproc_obj, imgfilebasename + '-repaired')

        # save repaired XML (i.e. XML with deskewed textbox positions)
        repaired_xmlfile = os.path.join(self.temp_folder, filename + '.repaired.xml')
        print("saving repaired XML file to '%s'..." % repaired_xmlfile)
        xmltree.write(repaired_xmlfile)

        # Clustering vertical lines:
        # cluster the detected *vertical* lines using find_clusters_1d_break_dist as a simple clustering function
        # (break on distance MIN_COL_WIDTH/2); additionally, remove all cluster sections that are considered
        # empty. A cluster section is considered empty when the number of text boxes in it is below 10% of the
        # median number of text boxes per cluster section.
        MIN_COL_WIDTH = g.MIN_COL_WIDTH  # minimum width of a column in pixels, measured in the scanned pages
        vertical_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_VERTICAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p['texts'],    # use this page's textboxes
            remove_empty_cluster_sections_n_texts_ratio=0.1,       # 10% rule
            remove_empty_cluster_sections_scaling=page_scaling_x,  # positions are in "scanned image space" -> scale them to "text box space"
            dist_thresh=MIN_COL_WIDTH / 2)
        print("> found %d clusters" % len(vertical_clusters))

        # draw the clusters
        img_w_clusters = iproc_obj.draw_line_clusters(imgproc.DIRECTION_VERTICAL, vertical_clusters)
        save_img_file = os.path.join(self.temp_folder, '%s-vertical-clusters.png' % imgfilebasename)
        print("> saving image with detected vertical clusters to '%s'" % save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters)

        # Clustering horizontal lines: same approach, breaking on distance MIN_ROW_WIDTH/2
        MIN_ROW_WIDTH = g.MIN_ROW_WIDTH  # minimum height of a row in pixels, measured in the scanned pages
        horizontal_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_HORIZONTAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p['texts'],
            remove_empty_cluster_sections_n_texts_ratio=0.1,
            remove_empty_cluster_sections_scaling=page_scaling_y,
            dist_thresh=MIN_ROW_WIDTH / 2)
        print("> found %d clusters" % len(horizontal_clusters))

        # draw the clusters
        img_w_clusters_hoz = iproc_obj.draw_line_clusters(imgproc.DIRECTION_HORIZONTAL, horizontal_clusters)
        save_img_file = os.path.join(self.temp_folder, '%s-horizontal-clusters.png' % imgfilebasename)
        print("> saving image with detected horizontal clusters to '%s'" % save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters_hoz)

        page_colpos = np.array(calc_cluster_centers_1d(vertical_clusters)) / page_scaling_x
        print('found %d column borders:' % len(page_colpos))
        print(page_colpos)
        page_rowpos = np.array(calc_cluster_centers_1d(horizontal_clusters)) / page_scaling_y
        print('found %d row borders:' % len(page_rowpos))
        print(page_rowpos)

        # right border of the second column
        col2_rightborder = page_colpos[2]

        # calculate median text box height
        median_text_height = np.median([t['height'] for t in p['texts']])

        # get all texts in the first two columns with a "usual" textbox height;
        # only these text boxes are used to determine the line positions because they are more "stable" --
        # otherwise, especially the right side of the column header can lead to problems detecting the first table row
        text_height_deviation_thresh = median_text_height / 2
        texts_cols_1_2 = [t for t in p['texts']
                          if t['right'] <= col2_rightborder
                          and abs(t['height'] - median_text_height) <= text_height_deviation_thresh]

        # get all textboxes' top and bottom border positions
        borders_y = border_positions_from_texts(texts_cols_1_2, DIRECTION_VERTICAL)

        # break into clusters using half of the median text height as break distance
        clusters_y = find_clusters_1d_break_dist(borders_y, dist_thresh=median_text_height / 2)
        clusters_w_vals = zip_clusters_and_values(clusters_y, borders_y)

        # for each cluster, calculate the median as center
        pos_y = calc_cluster_centers_1d(clusters_w_vals)
        pos_y.append(p['height'])
        print('number of line positions:', len(pos_y))

        pttrn_table_row_beginning = re.compile(r'^[\d Oo][\d Oo]{2,} +[A-ZÄÖÜ]')

        # 1. try to find the top row of the table
        texts_cols_1_2_per_line = split_texts_by_positions(
            texts_cols_1_2, pos_y, DIRECTION_VERTICAL,
            alignment='middle', enrich_with_positions=True)

        # go through the texts line per line
        for line_texts, (line_top, line_bottom) in texts_cols_1_2_per_line:
            line_str = join_texts(line_texts)
            if pttrn_table_row_beginning.match(line_str):  # check if the line content matches the given pattern
                top_y = line_top
                break
        else:
            top_y = 0
        print('Top_y: %s' % top_y)

        # hints for a footer text box
        words_in_footer = ('anzeige', 'annahme', 'ala')

        # 2. try to find the bottom row of the table
        min_footer_text_height = median_text_height * 1.5
        min_footer_y_pos = p['height'] * 0.7
        # get all texts in the lower 30% of the page that are at least 50% bigger than the median textbox height
        bottom_texts = [t for t in p['texts']
                        if t['top'] >= min_footer_y_pos and t['height'] >= min_footer_text_height]
        bottom_texts_per_line = split_texts_by_positions(
            bottom_texts,
            pos_y + [p['height']],  # always down to the end of the page
            DIRECTION_VERTICAL,
            alignment='middle',
            enrich_with_positions=True)

        # go through the texts at the bottom line per line
        page_span = page_colpos[-1] - page_colpos[0]
        min_footer_text_width = page_span * 0.8
        for line_texts, (line_top, line_bottom) in bottom_texts_per_line:
            line_str = join_texts(line_texts)
            has_wide_footer_text = any(t['width'] >= min_footer_text_width for t in line_texts)
            # check if there's at least one wide text or if all of the required words for a footer match
            if has_wide_footer_text or all_a_in_b(words_in_footer, line_str):
                bottom_y = line_top
                break
        else:
            bottom_y = p['height']
        print(bottom_y)
        print(pos_y)

        # finally filter the line positions so that only the lines between the table top and bottom are left
        print(page_rowpos)
        print("> page %d: %d lines between [%f, %f]" % (p_num, len(page_rowpos), top_y, bottom_y))

        def subsequent_pairs(l):
            """
            Return subsequent pairs of values in a list <l>, i.e.
            [(x1, x2), (x2, x3), (x3, x4), ... (xn-1, xn)] for a list [x1 ... xn]
            """
            return [(l[i - 1], v) for i, v in enumerate(l) if i > 0]

        # page_rowpos = [y for y in pos_y if top_y <= y <= bottom_y]
        print(page_colpos, page_rowpos)
        grid = make_grid_from_positions(page_colpos, page_rowpos)
        # print(grid)
        n_rows = len(grid)
        n_cols = len(grid[0])
        print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))

        page_grids_file = os.path.join(self.temp_folder, filename + '_pagegrids.json')
        print("saving page grids JSON file to '%s'" % page_grids_file)
        save_page_grids({p_num: grid}, page_grids_file)

        datatable = fit_texts_into_grid(p['texts'], grid)
        df = datatable_to_dataframe(datatable)
        # print(df.head(n=2))
        csv_output_file = os.path.join(self.tables_folder, filename + '.csv')
        print("saving extracted data to '%s'" % csv_output_file)
        df.to_csv(csv_output_file, index=False, header=False)
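# The function above hard-codes camelot's flavor per branch. A quick way to
# sanity-check which flavor suits a given page is to compare camelot's
# parsing reports; a minimal sketch (pdf_path and page are placeholders):
import camelot

def pick_camelot_flavor(pdf_path, page):
    scores = {}
    for flavor in ('lattice', 'stream'):
        tables = camelot.read_pdf(pdf_path, flavor=flavor, pages=str(page))
        # each Table exposes a parsing_report dict with an 'accuracy' score
        scores[flavor] = tables[0].parsing_report['accuracy'] if len(tables) else 0.0
    return max(scores, key=scores.get)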
# http://theautomatic.net/2019/05/24/3-ways-to-scrape-tables-from-pdfs-with-python/
# https://www.thepythoncode.com/article/extract-pdf-tables-in-python-camelot
# https://www.java.com/en/download/help/windows_manual_download.html

# Read tables in PDF (tabula)
import tabula

tables = tabula.read_pdf(file, pages="all", multiple_tables=True)
print(tables)
exit()

# Read tables in PDF (Camelot)
# http://theautomatic.net/2019/05/24/3-ways-to-scrape-tables-from-pdfs-with-python/
# https://www.thepythoncode.com/article/extract-pdf-tables-in-python-camelot
import camelot

tables = camelot.read_pdf(file)
print(tables.n)
exit()

# Read the PDF (PyPDF2)
import PyPDF2

pdfFileObj = open(file, 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
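# Note: PdfFileReader is the legacy PyPDF2 API; in current pypdf releases
# the equivalent (assuming pypdf is installed) would be:
# from pypdf import PdfReader
# print(len(PdfReader(file).pages))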
"value": main_sum[6] }, ], }], } # 発生状況リンク取得 href = tag.find("a", text=re.compile("^県内の発生状況")).get("href") link = pdf_link(urljoin(url, href)) # patients # PDF読込 tables = camelot.read_pdf(link, pages="all", split_text=True, strip_text="\n", line_scale=40) df_tmp = pd.concat([table.df for table in tables]).reset_index(drop=True) df_kanja = df_tmp.T.set_index(0).T df_kanja.rename(columns={"NO.": "No"}, inplace=True) df_kanja["No"] = df_kanja["No"].astype(int) df_kanja.sort_values("No", inplace=True) df_kanja["公表日"] = df_kanja["判明日"].apply(my_parser) df_kanja["リリース日"] = df_kanja["公表日"].dt.strftime("%Y-%m-%dT08:00:00.000Z")
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  7 09:38:29 2019

@author: ALarger
Cal Pesticide Residues 2001
"""
import camelot
import pandas as pd

chemName = []
tables = camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\California\Cal Pesticide Residues\document_1359549.pdf',
    pages='2296-2754',
    flavor='stream')
for table in tables:
    df = table.df
    chemName.extend(df.iloc[:, 0])

# Go backwards through the chemical name list so that the indexing
# does not get messed up when an entry is deleted.
m = len(chemName)
while m > 0:
    m -= 1
    chemName[m] = chemName[m].strip()
    if chemName[m] in ('', 'NO RESIDUE FOUND', 'CHEMNAME'):
        del chemName[m]
nIngredients = len(chemName)
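# An equivalent, more idiomatic version of the backwards-deletion cleanup
# above; it is idempotent, so running it after the while loop is harmless:
chemName = [c for c in chemName if c not in ('', 'NO RESIDUE FOUND', 'CHEMNAME')]
nIngredients = len(chemName)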
def test_lattice_process_background():
    df = pd.DataFrame(data_lattice_process_background)
    filename = os.path.join(testdir, "background_lines_1.pdf")
    tables = camelot.read_pdf(filename, process_background=True)
    assert df.equals(tables[1].df)
def test_arabic():
    df = pd.DataFrame(data_arabic)
    filename = os.path.join(testdir, "tabula/arabic.pdf")
    tables = camelot.read_pdf(filename)
    assert df.equals(tables[0].df)
def parse(url):
    tables = camelot.read_pdf(url, pages="all")
    surv, chro, dist = 0, [], []
    for table in tables:
        if len(table.df.columns) in [2, 3]:
            dist.append(table.df)
        if len(table.df.columns) in [4, 5]:
            if table.df[len(table.df.columns) - 1][0] == "Remarks":
                chro.append(table.df)
            if table.df[0][0] == "District":
                surv = table.df
    data = init_data()
    num, rem, dis = "", "", ""

    # manual fixes
    # for bule_25032020.pdf
    if "bule_25032020" in url:
        chro[0][2][4] = "Pathanamthitta – 4\nKottayam – 2 \nErnakulam -2"
    # for bule_20032020.pdf
    if "bule_20032020" in url:
        chro[0][1][9] = "Thiruvananthapuram -3"
        chro[0][1][10] = "Thiruvananthapuram -1"

    # Parses table: Chronology of Positive cases
    i = 1
    if "patient" in chro[0][0][0]:
        i = 0

    def dis_parse(s):
        return list(
            map(
                re.compile("[-–]").split,
                re.sub(
                    r"[\(\[].*?[\)\]]",
                    "",
                    s.replace(" ", "").replace(",", ""),
                    flags=re.DOTALL,
                ).splitlines(),
            ))

    def add(n):
        data[t[0]]["corona_positive"] += n
        if "Negative" in rem:
            data[t[0]]["cured_discharged"] += n
        if "Expired" in status:
            data[t[0]]["deaths"] += n

    for ch in chro:
        for row in ch.iterrows():
            if "persons have been" in row[1][0]:
                continue
            if "patient" in row[1][i]:
                continue
            if row[1][i].isnumeric():
                num = int(row[1][i])
            if row[1][i + 1]:
                dis = dis_parse(row[1][i + 1])
            else:
                continue
            if row[1][i + 2]:
                rem = row[1][i + 2]
            status = row[1][i + 3]
            if len(dis) > 1:
                if len(dis[0]) > 1:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(int(t[1]))
                else:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(1)
            else:
                if len(dis[0]) > 1:
                    for t in dis:
                        t[0] = check_alt(t[0])
                        add(int(t[1]))
                else:
                    t = dis[0]
                    t[0] = check_alt(t[0])
                    inc = 1
                    if row[1][i]:
                        inc = int(num)
                    add(inc)

    # Parses table: Details of persons under Surveillance
    for row in surv.iterrows():
        if any(x in row[1][0] for x in ["District", "Total"]):
            continue
        t = check_alt(row[1][0].strip())
        data[t]["under_observation"] += int(row[1][1])
        data[t]["under_home_isolation"] += int(row[1][2])
        data[t]["total_hospitalised"] += int(row[1][3])
        data[t]["hospitalised_today"] += int(row[1][4])

    # Parses table: District wise distribution based on hospital admission
    for di in dist:
        for row in di.iterrows():
            if any(x in row[1][0] for x in ["District", "Total"]):
                continue
            t = check_alt(row[1][0].strip())
            data[t]["positive_admitted"] += int(row[1][1])

    data = {
        "kerala": data,
        "time": datetime.now().isoformat(),
        "file_url": url,
    }
    return json.dumps(data)
@author: ALarger
"""
import camelot, re, csv
import pandas as pd

chemName = []
templateName = []
funcUse = []
prodID = []
chem = ''
# Fruits and Veggies
tables = camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\USDA\Pesticide Annual Summary 2002\document_1363480.pdf',
    pages='52-94',
    flavor='stream')
for table in tables:
    df = table.df
    for index, row in df.iterrows():
        if row[1] == '' and row[0] != '':
            chem = row[0].split('(')[0].strip()
            use = ', '.join(re.findall(r'\(.*?\)', row[0])).replace('(', '').replace(')', '')
        elif row[2] != '0' and row[2] != '' and all(
                c in '1234567890 ' for c in row[2]) and 'TOTAL' not in row[0]:
            commodity = row[0].split('(')[0].strip().replace('/', ' and ')  # Can't have / in a filename
            chemName.append(chem)
import string

import camelot

chemName = []
casN = []
prodID = []
templateName = []
msdsDate = []
recUse = []
catCode = []
descrip = []
code = []
sourceType = []
propName = []
# Strip any non-printable characters from a string
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
tables = camelot.read_pdf(
    r'L:\Lab\HEM\ALarger\Actor Automated Extraction\FDA\CDER Drug and Biologic Approvals 2010\CDER New Drug Approvals 2010.pdf',
    pages='all',
    flavor='lattice')
i = 0
for table in tables:
    df = tables[i].df
    if i == (len(tables) - 1):
        chemName.extend(df.iloc[:, 2])
        casN.extend([''] * len(df))
        propName.extend(df.iloc[:, 1])
        prodID.extend(['1372142'] * len(df))
        templateName.extend(['CDER Biologic License Approvals 2010.pdf'] * len(df))
        msdsDate.extend([2010] * len(df))
        recUse.extend([''] * len(df))
        catCode.extend([''] * len(df))
        descrip.extend([''] * len(df))
import camelot
import pandas as pd

chemName = []
casN = []
prodID = []
templateName = []
msdsDate = []
recUse = []
catCode = []
descrip = []
code = []
sourceType = []
tables = camelot.read_pdf(
    r'L:/Lab/HEM/ALarger/Actor Automated Extraction/Arizona/(2003) Pesticide Contamination Prevention Program Report A.R.S. 49-303.B/document_320432.pdf',
    pages='3-18',
    flavor='lattice')
i = 0
for table in tables:
    df = tables[i].df
    if i <= 6:  # Table 2
        df = df.drop(df.index[0])
        df = df.drop(df.index[0])
        chemName.extend(df.loc[:, 1])
        casN.extend(df.loc[:, 0])
        prodID.extend(['1372080'] * len(df))
        templateName.extend(['Arizona 2003 Pesticide Report Table 2.pdf'] * len(df))
        msdsDate.extend([2003] * len(df))
        recUse.extend([''] * len(df))
        catCode.extend([''] * len(df))
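# The parallel lists built above are presumably combined and written out in
# a later, elided part of the script; a hedged sketch of what that assembly
# could look like (column names and output filename are assumptions):
df_out = pd.DataFrame({
    'chemName': chemName,
    'casN': casN,
    'prodID': prodID,
    'templateName': templateName,
    'msdsDate': msdsDate,
    'recUse': recUse,
    'catCode': catCode,
})
df_out.to_csv('arizona_2003_pesticide_report.csv', index=False)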
def pdf_to_table(self, path):
    tables = camelot.read_pdf(path, pages="all")
    return tables
import camelot

tables = camelot.read_pdf("Sample SLDC file.pdf")
first_table = tables[0].df
print(first_table)
def read_pdf(file_path):
    tables = camelot.read_pdf(file_path, pages="1-end")
    for table in tables:
        yield table
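# Example use of the generator above; "report.pdf" is a placeholder path.
# Yielding table by table keeps the caller's memory footprint flat when a
# document contains many pages.
for table in read_pdf("report.pdf"):
    print(table.df.shape)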
def test_stream_edge_tol():
    df = pd.DataFrame(data_stream_edge_tol)
    filename = os.path.join(testdir, "edge_tol.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
    assert df.equals(tables[0].df)
def main():
    pdf_temp = '/tmp/covid.tmp.pdf'
    header = ('caso', 'estado', 'sexo', 'edad', 'sintomas', 'rt-pcr',
              'procedencia', 'llegada')
    if len(sys.argv) == 2:
        url = sys.argv[1]
        tld = extract(url)
        if tld.suffix not in ['gob.mx']:
            sys.stdout.write('Error: %sInvalid URL. URL should be from gob.mx domain%s.\n' % (RED, RESET))
            sys.exit(1)
        try:
            result = search(r"([0-9]{4}\.[0-9]{2}\.[0-9]{2})", url)
            report_date = datetime.strptime(result[0], '%Y.%m.%d')
        except Exception:
            sys.stdout.write('Error: %sInvalid format for report date URL. Format should contain date YYYY.MM.DD%s.\n' % (RED, RESET))
            sys.exit(1)
        if 'sospechosos' in url:
            report_type = 's'
        elif 'positivos' in url:
            report_type = 'c'
        else:
            sys.stdout.write('Error: %sURL should point to a "sospechosos" or "positivos" report%s.\n' % (RED, RESET))
            sys.exit(1)
        json_filename = 'json/%s-%02d-%02d-%s.json' % (
            report_date.year, report_date.month, report_date.day, report_type)
        sys.stdout.write('Downloading PDF from: %s \n' % url)
        try:
            file = wget.download(url, pdf_temp)
        except Exception:
            sys.stdout.write('%sError%s\n' % (RED, RESET))
            sys.exit(1)
        # Overwrite file if it already exists
        if os.path.exists(pdf_temp):
            shutil.move(file, pdf_temp)
        sys.stdout.write(' [%sDONE%s]\nPerforming PDF conversion (this may take a while): ' % (GREEN, RESET))
        start = time.time()
        try:
            tables = camelot.read_pdf(pdf_temp, parallel=True, pages='all')
        except Exception:
            sys.stdout.write('%sInvalid format. Downloaded file should be a PDF%s.\n' % (RED, RESET))
            sys.exit(1)
        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)
        sys.stdout.write(' %02dh %02dm %02.2fs [%sDONE%s]\n' %
                         (int(hours), int(minutes), seconds, GREEN, RESET))
        json_file = open(json_filename, 'w')
        json_file.write('{"datos":[')
        first_line = True
        first_row = True
        counter = 1
        sys.stdout.write('Creating JSON file: ')
        for table in tables:
            for row in table.data:
                if first_line is False:
                    if first_row is False:
                        json_file.write(',\n')
                    data = dict(zip(header, row))
                    # Clean strings
                    data['estado'] = data['estado'].replace('*', '')
                    data['estado'] = data['estado'].replace('\n', '')
                    sys.stdout.write('%s.%s' % (BOLD, RESET))
                    json.dump(data, json_file, ensure_ascii=False, indent=3)
                    first_row = False
                first_line = False
                counter += 1
        json_file.write(']}')
        sys.stdout.write(' [%sDONE%s]\n' % (GREEN, RESET))
        json_file.close()
        # JSON validation
        sys.stdout.write('Validating JSON: ')
        json_file = open(json_filename, 'r')
        json_data = json_file.read()
        try:
            json.loads(json_data)
            sys.stdout.write('%sfile is valid%s.\n' % (GREEN, RESET))
            sys.stdout.write('\n')
            sys.stdout.write('File %s%s%s created successfully.\n' % (BOLD, json_filename, RESET))
            sys.stdout.write('%s%02d%s cases processed from %s%d%s pages.\n' %
                             (BOLD, counter, RESET, BOLD, len(tables), RESET))
        except ValueError:
            sys.stdout.write('\033[1;31mJSON is invalid, check for errors\033[0;0m.\n')
        json_file.close()
        sys.stdout.write('\n')
        sys.exit(0)
    else:
        sys.stdout.write('%sError: report URL is required%s.\n' % (RED, RESET))
        sys.stdout.write('Usage: %spython pdf-importer.py <URL>%s.\n' % (CYAN, RESET))
        sys.exit(1)
def test_lattice_table_areas():
    df = pd.DataFrame(data_lattice_table_areas)
    filename = os.path.join(testdir, "twotables_2.pdf")
    tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
    assert df.equals(tables[0].df)
def parse_pdf(file_path):
    tables = Wrangler.__define_table_configs()
    date_regex = re.compile(r"(\d{1,2}\/){2}\d{4}")
    notes_regex = re.compile(r"notes:?", flags=re.IGNORECASE)
    print("Processing PDF...")
    parsed_tables = camelot.read_pdf(filepath=file_path, pages='1-end')
    for i, tbl in enumerate(parsed_tables):
        print(f"\nParsing page {i+1}.")
        df = tbl.df
        # the following line is mainly for debugging, to check whether table start/end matches the index;
        # it exports these csv files to the project root directory
        # df.to_csv(f"parsed-table-{i}.csv")
        column_a = df.iloc[:, 0]
        cache = {'last': None, 'history': []}
        for i, cell in column_a.items():
            cell = cell.strip().lower()
            # basic logic for parsing one of these tables is as follows...
            # step 1: find a matching table name, save to cache['last']
            # step 2: find the next date cell, save start row as i
            # step 3: find the 'Notes:' cell, save end row as i-1 (blank row)
            # step 4: empty cache['last'], return to step 1
            if cache['last']:
                if tables[cache['last']].start:
                    if notes_regex.search(cell):
                        print(f" > last row for '{cache['last']}': {i}")
                        tables[cache['last']].end = i - 1
                        cache['history'].append(cache['last'])
                        cache['last'] = None
                    else:
                        continue
                else:
                    if date_regex.search(cell):
                        print(f" > first row for '{cache['last']}': {i}")
                        tables[cache['last']].start = i
                    else:
                        continue
            else:
                if cell in tables:
                    print(f"Found '{cell}' table!")
                    cache['last'] = cell
                else:
                    continue
        # now cache['history'] is a list of str table names that we parsed from this tbl
        print("\nBuilding DataFrames...")
        for name in cache['history']:
            start_row = tables[name].start
            end_row = tables[name].end
            print(f" > '{name}' rows: {start_row} to {end_row}")
            tables[name].df = df.iloc[start_row:end_row, :].copy()
            t_df = tables[name].df
            t_df.replace(to_replace=r"^\s*$", value=np.nan, regex=True, inplace=True)
            t_df.dropna(axis=0, how='all', inplace=True)
            t_df.dropna(axis=1, how='all', inplace=True)
            t_df.replace(to_replace=r"\n", value=" ", regex=True, inplace=True)
            num_parsed_cols = t_df.shape[1]
            expected_cols = tables[name].cols
            if num_parsed_cols == len(expected_cols):
                t_df.columns = expected_cols
            else:
                # TODO: if it doesn't match, raise a warning and skip it for now? (ask Roberto)
                print(f"WARNING: Column length mismatch! # of parsed cols: {num_parsed_cols}, "
                      f"# of expected cols: {len(expected_cols)}")
                continue
    print("\nFinished parsing!")
    for table in tables:
        print(f"\n'{table}' DataFrame:\n{tables[table].df.head()}")
    return tables
def test_lattice_copy_text():
    df = pd.DataFrame(data_lattice_copy_text)
    filename = os.path.join(testdir, "row_span_1.pdf")
    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
    assert df.equals(tables[0].df)
import camelot
import numpy as np
import os
import pandas as pd
import PyPDF4
import re
import io
import time

os.chdir(r"C:\Users\jdavi\Downloads")

# Tables in a digital (native) document.
# It is crucial to specify the page the table is on;
# loop over each page to make sure the table gets parsed.
tables = camelot.read_pdf('C_PROCESO_20-12-10643602_205031011_72747497.pdf',
                          pages='1,2', flavor='lattice')
tables1 = camelot.read_pdf('C_PROCESO_20-12-10658075_205142011_72882086.pdf',
                           pages='2')
# tables = camelot.read_pdf('C_PROCESO_20-12-10643602_205031011_72747497.pdf', pages='2')
tables[0].df
tables1[1].df
tables.export('secop_test1.csv', f='csv', compress=True)

# Tables in a scanned document
import tabula  # works better; pip install tabula-py
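# A minimal tabula-py sketch for the scanned-document case mentioned above
# (tabula-py needs a Java runtime; the file name is reused from this snippet):
dfs = tabula.read_pdf('C_PROCESO_20-12-10658075_205142011_72882086.pdf',
                      pages='2', multiple_tables=True)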
def test_stream():
    df = pd.DataFrame(data_stream)
    filename = os.path.join(testdir, "health.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")
    assert df.equals(tables[0].df)
import camelot
import requests
from datetime import date


def getPDF():
    baseUrl = 'https://www.winthrop.edu/uploadedFiles/Police/DailyCaseLog-'
    today = date.today()
    formatted_date = today.strftime("%B%Y")
    extension = ".pdf"
    monthext = formatted_date + extension
    url = baseUrl + monthext
    with open('/tmp/pol.pdf', 'wb') as f:
        f.write(requests.get(url).content)
    pdfFile = '/tmp/pol.pdf'
    return pdfFile


file = getPDF()
tables = camelot.read_pdf('/tmp/pol.pdf', pages='all', strip_text=' .\n')
for i in range(len(tables)):
    dateTime = tables[i].df.loc[4, 0]
    dateTimeList = dateTime.partition('T')
    time = dateTimeList[1] + dateTimeList[2]
    synopsis = tables[i].df.loc[10, 0].split(":")[1]
    print(synopsis)
    # if dateTimeList[i].split(":")[1] != '':
    #     print(dateTimeList[0])
    #     print("")
    #     print(time)
    #     print("")
    #     synopsis = tables[i].df.loc[10, 0]
    #     print(synopsis.split(": ")[1])
    #     print("")
import camelot
import re
import datetime
import matplotlib.pyplot as plt
from deltaCalculator import DeltaCalculator

deltaCalculator = DeltaCalculator(True)
startPid = input("Enter start page number:")
endPid = input("Enter end page number:")
pages = ""
for i in range(int(startPid), int(endPid) + 1):
    pages = pages + "," + str(i) if len(pages) != 0 else str(i)
print(pages)

tables = camelot.read_pdf('.tmp/ka.pdf', strip_text='\n', pages=pages, split_text=True)
for index, table in enumerate(tables):
    tables[index].to_csv('.tmp/ka' + str(index) + '.csv')

kaOutputFile = open('kafull.csv', 'w')
for index, table in enumerate(tables):
    kaFile = open('.tmp/ka' + str(index) + '.csv', 'r')
    lines = kaFile.readlines()
    for line in lines:
        line = line.replace('"', '')
        linesArray = line.split(',')
        if len(linesArray[7]) == 0:
            continue
import camelot

foo = camelot.read_pdf('SchuldnerAtlas_Deutschland_2019_Kreise_Alphabet.pdf',
                       pages='1,2,3,4,5,6,7,8,9,10,11')
foo.export('foo.csv', f='csv')
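# foo.export() writes one CSV per detected table (foo-page-*-table-*.csv).
# If a single combined CSV is wanted instead, a sketch using pandas
# (this assumes all detected tables share the same column layout):
import pandas as pd

combined = pd.concat([t.df for t in foo], ignore_index=True)
combined.to_csv('foo_combined.csv', index=False)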
import os
import re
import ntpath
import camelot
import pandas as pd
from glob import glob
import functions as fun

(USR, OVW) = ('srv', False)
(PATH_I, PATH_O) = fun.selectPaths(USR)
# Cycle through the PDFs -------------------------------------------------------
fPaths = glob(PATH_I + '*.pdf')
(fNum, fNames) = (len(fPaths), fun.stripPaths(fPaths))
print('- Storing files to {}'.format(PATH_O))
for (i, fPath) in enumerate(fPaths):
    print('\t* ({}/{}) Processing {}'.format(i + 1, fNum, fPath))
    # Check if the file already exists -----------------------------------------
    outFPath = PATH_O + fNames[i] + '.csv'
    outFExists = os.path.exists(outFPath)
    # Load file and parse ------------------------------------------------------
    if (not outFExists or OVW):
        # Cleanup the table ----------------------------------------------------
        tables = camelot.read_pdf(fPath, pages='1-end')
        dfs = [fun.cleanupDF(table) for table in tables]
        # Merge dataframes -----------------------------------------------------
        dfPre = pd.concat(dfs).dropna()
        dfPst = dfPre.applymap(fun.cleanCell)
        dfPst.to_csv(outFPath)
print('- Finished!')
def test_stream_flag_size():
    df = pd.DataFrame(data_stream_flag_size)
    filename = os.path.join(testdir, "superscript.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
    assert df.equals(tables[0].df)
def parse(input, pdf):
    print('---Parsing detected tables----')
    tables = camelot.read_pdf(pdf, pages=','.join(map(str, input)))
    tables.export('output.html', f='html', compress=True)
    # tables[0].to_csv('output.csv')
    print('---ZIP File Generated---')  # compress=True writes a .zip archive
def test_stream_strip_text():
    df = pd.DataFrame(data_stream_strip_text)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
    assert df.equals(tables[0].df)
def pdf_to_pandas_clear(path):
    '''
    Parameters
    ----------
    path : complete path of the folder where your PDFs are stored,
        e.g. "C:\\Users\\example\\Downloads"

    Returns
    -------
    notas : pandas DataFrame with the columns 'Negociação', 'Compra/Venda',
        'Tipo de Mercado', 'Especificação do título', 'Quantidade', 'Preço',
        'Valor', 'Débito/Crédito', 'Data' and 'Código', well formatted
    '''
    arquivos_path = listdir(path)
    notas_path = []
    for i in range(len(arquivos_path)):
        if 'Nota' in arquivos_path[i]:
            notas_path.append(arquivos_path[i])
    notas = pd.DataFrame()
    for i in notas_path:
        path_pdf = r'{}\{}'.format(path, i)
        # Using the lib camelot to select tables in a pdf
        tables = camelot.read_pdf(
            path_pdf,
            flavor='stream',
            table_areas=['0,600,600,400'],
            columns=['91,105,167,180,305,345,402,445,543'])
        # IF ERROR "EOF marker not found":
        # That is a strange error that usually happens with Clear PDFs.
        # It is caused by an error in the making of the PDF itself.
        # SOLVING: open the PDF file and make a small edit. For example,
        # open it in Microsoft Edge, where it is possible to draw with the
        # mouse, and make a small dot at the top of the page. Saving it
        # rewrites the file in a proper format (with the EOF marker).

        # Selecting the first table
        notas_de_corretagem = tables[0].df
        # Making the first row the header
        notas_de_corretagem.columns = notas_de_corretagem.loc[0]
        # Excluding the first row (now it is the header) and resetting the index
        notas_de_corretagem2 = notas_de_corretagem.drop([0], axis=0).reset_index(drop=True)
        # Dropping the unused columns, then fixing the column names
        notas_de_corretagem3 = notas_de_corretagem2.drop(['Prazo', 'Obs'], axis=1).reset_index(drop=True)
        notas_de_corretagem3.columns = [
            'Compra/Venda', 'Tipo de Mercado', 'Especificação do título',
            'TICKER', 'Quantidade', 'Preço', 'Valor', 'Débito/Crédito'
        ]
        # Fixing the date
        tables1 = camelot.read_pdf(path_pdf, flavor='stream')
        df_date = tables1[0].df
        date_wrong = df_date.iloc[2, 2]
        year = int(date_wrong[-4:])
        month = int(date_wrong[-7:-5])
        day = int(date_wrong[-10:-8])
        date = datetime.datetime(year, month, day)
        # Adding the date to the main table
        notas_de_corretagem4 = notas_de_corretagem3.assign(
            Data=[date] * len(notas_de_corretagem2))
        # Transforming 'V' into 'Venda' and 'C' into 'Compra'
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Compra/Venda']
            if y == 'C':
                w = 'Compra'
            elif y == 'V':
                w = 'Venda'
            else:
                w = 'Error'
            x = np.append(x, w)
        notas_de_corretagem4['Compra/Venda'] = x
        # Transforming 'D' into 'Débito' and 'C' into 'Crédito'
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Débito/Crédito']
            if y == 'D':
                w = 'Débito'
            elif y == 'C':
                w = 'Crédito'
            else:
                w = 'Error'
            x = np.append(x, w)
        notas_de_corretagem4['Débito/Crédito'] = x
        # Converting 'Preço' into a numerical value (float)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Preço']
            # Here we substitute the decimal ',' with '.' and drop the thousands separator '.'
            local = y.rfind(',')
            z = y[:local] + '.' + y[local + 1:]
            if len(z) > 6:
                local2 = z.find('.')
                t = float(z[:local2] + '' + z[local2 + 1:])
            else:
                t = float(z)
            x = np.append(x, t)
        notas_de_corretagem4['Preço'] = x
        # Converting 'Quantidade' into a numerical value (int)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = int(notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Quantidade'])
            x = np.append(x, y)
        notas_de_corretagem4['Quantidade'] = x
        # Calculating 'Valor' as a numerical value (float)
        x = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = np.round(
                notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Preço'] *
                notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Quantidade'], 2)
            x = np.append(x, y)
        notas_de_corretagem4['Valor'] = x
        # Converting the company name ('Especificação do título') to the stock code ('Código'),
        # selecting only the main name of the asset (ticker)
        on_pn = 0
        nome = 0
        codigo = []
        tipo = []
        for i in np.arange(len(notas_de_corretagem4)):
            y = notas_de_corretagem4.loc[notas_de_corretagem4.index[i], 'Especificação do título']
            w = y.split(' ')
            if len(w) == 3:
                w = w[:-1]
            nome = w[0]
            on_pn = w[1]
            cod = ''
            tipo_papel = ''
            for j in np.arange(len(Lista_empresas)):
                if nome == Lista_empresas['Nome de Pregão'][j]:
                    cod = str(Lista_empresas['Código'][j])[:4]
                    if "ON" in on_pn:
                        cod = f'{cod}{3}'
                        tipo_papel = 'Ação'
                    elif "PNA" in on_pn:
                        cod = f'{cod}{5}'
                        tipo_papel = 'Ação'
                    elif "PNB" in on_pn:
                        cod = f'{cod}{6}'
                        tipo_papel = 'Ação'
                    elif "PNC" in on_pn:
                        cod = f'{cod}{7}'
                        tipo_papel = 'Ação'
                    elif "PND" in on_pn:
                        cod = f'{cod}{8}'
                        tipo_papel = 'Ação'
                    elif "UNT " in on_pn:
                        cod = f'{cod}{11}'
                        tipo_papel = 'Ação'
                    elif "PN" in on_pn:
                        cod = f'{cod}{4}'
                        tipo_papel = 'Ação'
                    elif "CI" in on_pn:
                        cod = f'{cod}{11}'
                        tipo_papel = 'ETF'
            if cod == '':
                cod = on_pn
                tipo_papel = 'FII'
            codigo.append(cod)
            tipo.append(tipo_papel)
        notas_de_corretagem4['Código'] = codigo
        notas_de_corretagem4['Tipo Papel'] = tipo
        notas = pd.concat([notas, notas_de_corretagem4]).reset_index(drop=True)
    return notas
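# Example call of the function above; the folder path and output name are
# placeholders:
# notas = pdf_to_pandas_clear(r"C:\Users\example\Downloads")
# notas.to_csv("notas_consolidadas.csv", index=False)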
import camelot
from pathlib import Path
from tabulate import tabulate
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1

file_path = Path(Path.cwd(), "image.pdf")
file = open(str(file_path), 'rb')
parser = PDFParser(file)
document = PDFDocument(parser)
tables_list = []
tables = camelot.read_pdf(str(file_path), pages="all")
for i in range(1, 2):  # camelot page numbers are 1-based
    print(i)
    tables = camelot.read_pdf("image.pdf", pages='%d' % i)
    try:
        print(tabulate(tables[0].df))
        print(tabulate(tables[1].df))
    except IndexError:
        print('None')
tables.export('image.csv', f='csv', compress=False)
tables[0].parsing_report