import os
import subprocess

import camelot
from camelot.core import TableList


def main():
    for pdf in os.listdir():
        file_name, file_extension = os.path.splitext(pdf)
        if file_extension == '.pdf':
            # pdfgrep lists the pages mentioning "Revenue" or "Income"; awk keeps
            # the page numbers and tr joins them into a comma-separated string.
            cmd = ("pdfgrep -Pn '^(?s:(?=.*Revenue)|(?=.*Income))' " + pdf +
                   " | awk -F\":\" '$0~\":\"{print $1}' | tr '\n' ','")
            pages = subprocess.check_output(cmd, shell=True).decode("utf-8")
            # Drop the trailing comma left by tr, which camelot's page parser rejects.
            pages = pages.rstrip(',')
            print(pdf)
            tables = camelot.read_pdf(pdf, flavor='stream', pages=pages, edge_tol=100)
            filtered = []
            for index, table in enumerate(tables):
                whitespace = tables[index].parsing_report.get('whitespace')
                if whitespace <= 25:
                    filtered.append(tables[index])
            filtered_tables = TableList(filtered)
            filtered_tables.export('test.xlsx', f='excel', compress=True)
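# A minimal usage sketch (not from the original), reusing camelot and TableList
# imported above: the same whitespace-based filter expressed as a helper over
# camelot's parsing_report, which also reports 'accuracy', 'order' and 'page'.
# The helper name, 'statement.pdf' and the 25% threshold are assumptions.
def keep_dense_tables(path, pages='1', max_whitespace=25):
    tables = camelot.read_pdf(path, flavor='stream', pages=pages, edge_tol=100)
    kept = [t for t in tables if t.parsing_report['whitespace'] <= max_whitespace]
    return TableList(kept)

# e.g. keep_dense_tables('statement.pdf', pages='3,7').export('test.xlsx', f='excel', compress=True)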
def table_list(gp_response):
    """
    Convert Camelot GoPro's response to Camelot's TableList type object

    :param gp_response: Response received from Camelot GoPro
    :return: Camelot's TableList like object with extra

    Attributes
    ----------
    Regular Camelot's TableList object
    Pages: int
        Total number of input pages in pdf or 1 if image
    JobStatus : str ('Success'|'Failed'|'Processing'|'Incomplete')
        Job status of finding tables
    """
    prev_page = 0
    order = 1
    if gp_response["JobStatus"].lower().startswith("succe"):
        pass
    elif gp_response["JobStatus"].lower().startswith("process"):
        print("-=- " * 15)
        print(f'[Info]: Table Extraction process is {gp_response["JobStatus"]}')
        print("Check more info using '__dict__' descriptor on the result object.")
        print("Use the 'JobId' from the response, to check and retrieve the output when the 'Processing' is 'Success'")
        print("JobId is:", gp_response["JobId"])
        print("Follow the last step in link: "
              "https://github.com/ExtractTable/camelotpro/blob/master/how%20to%20code.ipynb")
        print("-=- " * 15)
    elif gp_response["JobStatus"].lower().startswith("fail"):
        print("[Info]: Table Extraction is Failed. Complete Response Below")
        for k, v in gp_response.items():
            print(f"{k}: {v}")
    elif not any([gp_response["JobStatus"].lower().startswith("succe"),
                  gp_response.get("Tables", [])]):
        print("[Info]: Table Extraction is not completed. Status:", gp_response["JobStatus"])
        print("Check more info using '__dict__' descriptor on the result object.\n")

    for each in gp_response.get("Tables", []):
        if each["Page"] == prev_page:
            order += 1
        else:
            prev_page = each["Page"]
            order = 1
        each["Order"] = order

    tmp_tbl_list = TableList([Table(gp_table) for gp_table in gp_response.get("Tables", [])])
    tmp_tbl_list.Pages = gp_response.pop("Pages", "NA")
    for k, v in gp_response.items():
        if k != "Tables":
            tmp_tbl_list.__setattr__(k, v)
    return tmp_tbl_list
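# Illustrative sketch (not part of the original): the minimal shape of the
# gp_response dict that table_list() reads, inferred from the key accesses
# above. Real responses also carry the extracted cell data that CamelotPro's
# Table wrapper consumes; the values below are made-up placeholders.
example_gp_response = {
    "JobStatus": "Success",        # drives the status branches
    "JobId": "example-job-id",     # only printed while the job is "Processing"
    "Pages": 1,                    # copied onto the result as .Pages
    "Tables": [{"Page": 1}],       # each entry gets an "Order" assigned per page
}
# tables = table_list(example_gp_response)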
from camelot.core import Table, TableList


def test_table_order():
    def _make_table(page, order):
        t = Table([], [])
        t.page = page
        t.order = order
        return t

    table_list = TableList([
        _make_table(2, 1),
        _make_table(1, 1),
        _make_table(3, 4),
        _make_table(1, 2),
    ])

    assert [(t.page, t.order) for t in sorted(table_list)] == [
        (1, 1),
        (1, 2),
        (2, 1),
        (3, 4),
    ]
    assert [(t.page, t.order) for t in sorted(table_list, reverse=True)] == [
        (3, 4),
        (2, 1),
        (1, 2),
        (1, 1),
    ]
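# Brief sketch (not part of the original): the assertions above hold because
# camelot's Table compares by (page, order), so sorting a TableList yields
# reading order across pages. A standalone illustration using the same imports:
first, second = Table([], []), Table([], [])
first.page, first.order = 1, 2
second.page, second.order = 2, 1
assert sorted(TableList([second, first]))[0] is first  # page 1 sorts before page 2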
def extract(job_id):
    # Session, Job, Rule, File and mkdirs come from the surrounding web
    # application; Lattice, Stream and TableList are camelot's parsers and
    # result container (camelot.parsers / camelot.core).
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            kwargs = pages[p]
            kwargs.update(rule_options)
            parser = Lattice(
                **kwargs) if flavor.lower() == 'lattice' else Stream(**kwargs)
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
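# Illustrative sketch (not part of the original): the rule_options JSON that
# extract() above unpacks. "pages" maps a page number to per-page parser kwargs,
# and whatever is left after popping "flavor" and "pages" is merged into every
# page's kwargs via kwargs.update(rule_options). Values are made-up placeholders.
import json

example_rule_options = json.dumps({
    "flavor": "lattice",
    "pages": {
        "1": {"process_background": False},
        "2": {"line_scale": 40},
    },
    "split_text": True,  # applied to every page
})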
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            kwargs = pages[p]
            kwargs.update(rule_options)
            parser = (Lattice(
                **kwargs) if flavor.lower() == "lattice" else Stream(**kwargs))
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, f"{froot}.{ext}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, f"{froot}.json")
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
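# Illustrative helper (not part of the original): the render_files mapping built
# above, pulled out so its shape is easy to see. For an uploaded "report.pdf"
# with an assumed datapath of "/tmp/demo" it yields
# {"report": "/tmp/demo/json/report.json"}, which is stored on the Job row as JSON.
import glob
import os


def build_render_files(datapath):
    return {
        os.path.splitext(os.path.basename(f))[0]: f
        for f in glob.glob(os.path.join(datapath, "json/*.json"))
    }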
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        parent_folder = os.path.join(conf.PDFS_FOLDER, file.file_id, '')
        docs = os.listdir(parent_folder)
        docs = sorted(
            list(
                map(lambda x: os.path.join(parent_folder, x),
                    filter(lambda x: x[0:4] == "file", docs))))

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        filepaths = json.loads(file.filepaths)
        filepaths_as_list = list(filepaths.values())

        tables = []
        i = 0
        for doc in docs:
            # Remove the page files produced for the previous document, then
            # split the current document into single-page PDFs with Ghostscript.
            for f in filepaths_as_list:
                os.remove(f)
            gs_call = 'gs -q -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -dSAFER -o {}page-%d.pdf {}'.format(
                parent_folder, doc)
            gs_call = gs_call.encode().split()
            null = open(os.devnull, 'wb')
            with Ghostscript(*gs_call, stdout=null) as gs:
                pass
            null.close()

            for p in pages:
                kwargs = pages[p]
                kwargs.update(rule_options)
                parser = Lattice(
                    **kwargs) if flavor.lower() == 'lattice' else Stream(
                        **kwargs)
                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    _t.page = int(p) + i
                tables.extend(t)
            i += len(pages)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
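# Illustrative sketch (not part of the original): the Ghostscript invocation
# above, written with subprocess instead of the python ghostscript bindings.
# With -sDEVICE=pdfwrite and %d in the output name, Ghostscript writes one PDF
# per page (page-1.pdf, page-2.pdf, ...), which is what filepaths points at.
import subprocess


def split_pdf_pages(parent_folder, doc):
    subprocess.run(
        ["gs", "-q", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-dSAFER",
         "-o", f"{parent_folder}page-%d.pdf", doc],
        check=True,
    )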
def extract(job_id):  # noqa
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            if p not in filepaths:
                continue
            if flavor.lower() == "lattice":
                kwargs = pages[p]
                parser = Lattice(**kwargs)
                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    _t.page = int(p)
                tables.extend(t)
            else:
                opts = pages[p]
                areas, columns = (
                    opts.get("table_areas", None),
                    opts.get("columns", None),
                )
                if areas and columns:
                    page_order = 1
                    for area, column in zip(areas, columns):
                        bbox = ([
                            round(v, 2) for v in map(float, area.split(","))
                        ] if area else [])
                        cols = list(map(float, column.split(","))) if column else []
                        split_text = rule_options.get("split_text", False)
                        if cols and bbox:
                            # Column separators are stored relative to the area's
                            # left edge; shift them to absolute x-coordinates.
                            abs_cols = [round(c + bbox[0], 2) for c in cols]
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = ",".join(map(str, abs_cols))
                            if len(abs_cols) > 4 and split_text:
                                pass  # split_text = False
                        elif bbox:
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = None
                            split_text = False
                        else:
                            table_region = None
                            table_area = None
                            table_columns = None
                        kwargs = dict(
                            table_regions=[table_region] if table_region else None,
                            table_areas=[table_area] if table_area else None,
                            columns=[table_columns] if table_columns else None,
                            row_tol=rule_options.get("row_close_tol", 2),
                            column_tol=rule_options.get("col_close_tol", 0),
                            edge_tol=rule_options.get("edge_close_tol", 50),
                            flag_size=rule_options.get("flag_size", False),
                            split_text=split_text,
                            strip_text=rule_options.get("strip_text", ""),
                        )
                        print(f"Using Stream({kwargs!r})")
                        parser = Stream(**kwargs)
                        t = parser.extract_tables(filepaths[p])
                        print(f"Result: {t}")
                        for _t in t:
                            _t.page = int(p)
                            _t.order = page_order
                            print(f"Table {_t.order}, Page {_t.page}: {_t.parsing_report}")
                            if _t.df.shape == (1, 2):
                                _t.df = _t.df.T
                            elif _t.shape == (1, 1):
                                _t.df = pd.concat(
                                    [
                                        _t.df[0],
                                        _t.df.replace({0: {_t.df.iat[0, 0]: ""}})[0],
                                    ],
                                    axis=0,
                                    ignore_index=True,
                                )
                            if len(_t.df.shape) < 2:
                                _t.df = _t.df.to_frame()
                            if _t.df.shape[1] < 4:
                                # pd.np is the legacy NumPy alias (removed in pandas 2.0).
                                _t.df = (_t.df.replace({"": pd.np.nan})
                                         .dropna(how="all").fillna(""))
                            print(_t.df)
                            page_order += 1
                        tables.extend(t)
                else:
                    continue
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        # Clear out any previous exports before writing fresh ones.
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            for dirname, dirs, files in os.walk(datapath):
                for of in files:
                    if of.endswith(("." + f, ".zip", ".xlsx")):
                        fp = os.path.join(dirname, of)
                        os.remove(fp)
            try:
                os.removedirs(f_datapath)
            except FileNotFoundError:
                pass
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, "{}.{}".format(froot, ext))
            print(f"Exporting as {f} to {f_datapath}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, "{}.json".format(froot))
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)