# Assumed module-level context for the snippets below (not shown in this
# excerpt): stdlib os, glob, json, logging, subprocess, datetime as dt;
# pandas as pd; the app's SQLAlchemy Session and Job/Rule/File models;
# camelot's Lattice, Stream and TableList; a Ghostscript wrapper; and
# helpers such as conf, mkdirs, get_pages, save_page, get_file_dim,
# get_image_dim, iterate_paths, clone_old_file, publish_new_file_message.


def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            # merge per-page options with the rule-wide options
            kwargs = pages[p]
            kwargs.update(rule_options)
            parser = (Lattice(**kwargs)
                      if flavor.lower() == 'lattice' else Stream(**kwargs))
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        # export in every supported format
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
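# Usage sketch (an assumption, not part of the original module): extract()
# is written as a fire-and-forget background task, so a caller would hand
# it a job_id off the request thread. enqueue_extract is a hypothetical
# helper showing one minimal way to do that.
import threading


def enqueue_extract(job_id):
    # job_id is assumed to reference a Job row already committed elsewhere
    worker = threading.Thread(target=extract, args=(job_id,), daemon=True)
    worker.start()
    return worker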
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            kwargs = pages[p]
            kwargs.update(rule_options)
            # Stream needs the user-drawn columns mapped onto each area
            kwargs = (create_respective_columns(kwargs)
                      if flavor.lower() == "stream" else kwargs)
            parser = (Lattice(**kwargs)
                      if flavor.lower() == "lattice" else Stream(**kwargs))
            t = parser.extract_tables(filepaths[p])
            for _t in t:
                _t.page = int(p)
            tables.extend(t)
        tables = core.TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        # keep each job's exports in their own subdirectory
        datapath = os.path.join(datapath, job_id)
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, f"{froot}.{ext}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, f"{froot}.json")
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
def split(file_id):
    try:
        def get_executable():
            import platform
            from distutils.spawn import find_executable

            class GhostscriptNotFound(Exception):
                pass

            gs = None
            system = platform.system().lower()
            try:
                if system == 'windows':
                    if find_executable('gswin32c.exe'):
                        gs = 'gswin32c.exe'
                    elif find_executable('gswin64c.exe'):
                        gs = 'gswin64c.exe'
                    else:
                        raise ValueError
                else:
                    if find_executable('gs'):
                        gs = 'gs'
                    elif find_executable('gsc'):
                        gs = 'gsc'
                    else:
                        raise ValueError
                if 'ghostscript' not in subprocess.check_output(
                        [gs, '-version']).decode('utf-8').lower():
                    raise ValueError
            except ValueError:
                raise GhostscriptNotFound(
                    'Please make sure that Ghostscript is installed'
                    ' and available on the PATH environment variable')
            return gs

        session = Session()
        file = session.query(File).filter(File.file_id == file_id).first()
        extract_pages, total_pages = get_pages(file.filepath, file.pages)

        (filenames, filepaths, imagenames, imagepaths, filedims, imagedims,
         detected_areas) = ({} for i in range(7))
        for page in extract_pages:
            # extract into single-page PDF
            save_page(file.filepath, page)

            filename = 'page-{}.pdf'.format(page)
            filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
            imagename = ''.join([filename.replace('.pdf', ''), '.png'])
            imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

            # convert single-page PDF to PNG
            gs_call = [
                '-q', '-sDEVICE=png16m', '-o', imagepath, '-r600', filepath
            ]
            gs = get_executable()
            gs_call.insert(0, gs)
            process = subprocess.Popen(gs_call)
            out = process.communicate()[0]
            ret = process.wait()

            filenames[page] = filename
            filepaths[page] = filepath
            imagenames[page] = imagename
            imagepaths[page] = imagepath
            filedims[page] = get_file_dim(filepath)
            imagedims[page] = get_image_dim(imagepath)

            lattice_areas, stream_areas = (None for i in range(2))
            # lattice
            parser = Lattice()
            tables = parser.extract_tables(filepath)
            if len(tables):
                lattice_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    lattice_areas.append((x1, y2, x2, y1))
            # stream
            parser = Stream()
            tables = parser.extract_tables(filepath)
            if len(tables):
                stream_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    stream_areas.append((x1, y2, x2, y1))

            detected_areas[page] = {
                'lattice': lattice_areas,
                'stream': stream_areas
            }

        file.extract_pages = json.dumps(extract_pages)
        file.total_pages = total_pages
        file.has_image = True
        file.filenames = json.dumps(filenames)
        file.filepaths = json.dumps(filepaths)
        file.imagenames = json.dumps(imagenames)
        file.imagepaths = json.dumps(imagepaths)
        file.filedims = json.dumps(filedims)
        file.imagedims = json.dumps(imagedims)
        file.detected_areas = json.dumps(detected_areas)

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
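# Standalone sketch of the Ghostscript call that split() assembles above,
# using the executable located by get_executable(). pdf_page_to_png is a
# hypothetical helper, but the flags mirror the source: quiet mode, the
# 24-bit RGB png16m device, an explicit output file, and a fixed DPI.
import subprocess


def pdf_page_to_png(gs, pdf_path, png_path, dpi=600):
    # equivalent to: gs -q -sDEVICE=png16m -o out.png -r600 in.pdf
    subprocess.check_call(
        [gs, '-q', '-sDEVICE=png16m', '-o', png_path, '-r{}'.format(dpi),
         pdf_path])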
def split(file_id):
    try:
        session = Session()
        file: File = session.query(File).filter(
            File.file_id == file_id).first()
        extract_pages, total_pages = get_pages(file.filepath, file.pages)

        (
            filenames,
            filepaths,
            imagenames,
            imagepaths,
            filedims,
            imagedims,
            detected_areas,
        ) = ({} for i in range(7))
        for page in extract_pages:
            # extract into single-page PDF
            save_page(file.filepath, page)

            filename = f"page-{page}.pdf"
            filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
            imagename = "".join([filename.replace(".pdf", ""), ".png"])
            imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

            # convert single-page PDF to PNG
            gs_call = f"-q -sDEVICE=png16m -o {imagepath} -r300 {filepath}"
            gs_call = gs_call.encode().split()
            null = open(os.devnull, "wb")
            with Ghostscript(*gs_call, stdout=null):
                pass
            null.close()

            filenames[page] = filename
            filepaths[page] = filepath
            imagenames[page] = imagename
            imagepaths[page] = imagepath
            filedims[page] = get_file_dim(filepath)
            imagedims[page] = get_image_dim(imagepath)

            lattice_areas, stream_areas = (None for i in range(2))
            # lattice
            parser = Lattice()
            tables = parser.extract_tables(filepath)
            if len(tables):
                lattice_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    lattice_areas.append((x1, y2, x2, y1))
            # stream
            parser = Stream()
            tables = parser.extract_tables(filepath)
            if len(tables):
                stream_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    stream_areas.append((x1, y2, x2, y1))

            detected_areas[page] = {
                "lattice": lattice_areas,
                "stream": stream_areas
            }

        # reuse the artifacts of an identical earlier upload, if any
        # (iterate_paths and clone_old_file are app helpers)
        file_is_new = True
        same_as = None
        for old_file in session.query(File).filter(
                File.file_id != file_id,
                File.filename == file.filename,
                File.same_as.is_(None),
        ):
            file_is_new = file_is_new and iterate_paths(imagepaths, old_file)
            same_as = same_as if file_is_new else old_file

        if file_is_new:
            file.extract_pages = json.dumps(extract_pages)
            file.total_pages = total_pages
            file.has_image = True
            file.filenames = json.dumps(filenames)
            file.filepaths = json.dumps(filepaths)
            file.imagenames = json.dumps(imagenames)
            file.imagepaths = json.dumps(imagepaths)
            file.filedims = json.dumps(filedims)
            file.imagedims = json.dumps(imagedims)
            file.detected_areas = json.dumps(detected_areas)
            file.same_as = None
            file.deleted_folder = False
            session.commit()
            session.close()
            publish_new_file_message(file)
        else:
            clone_old_file(file, same_as)
            session.commit()
            session.close()
    except Exception as e:
        logging.exception(e)
def extract(job_id):
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        parent_folder = os.path.join(conf.PDFS_FOLDER, file.file_id, '')
        docs = os.listdir(parent_folder)
        docs = sorted(
            list(
                map(lambda x: os.path.join(parent_folder, x),
                    filter(lambda x: x[0:4] == "file", docs))))

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop('flavor')
        pages = rule_options.pop('pages')

        filepaths = json.loads(file.filepaths)
        filepaths_as_list = list(filepaths.values())
        tables = []
        i = 0
        for doc in docs:
            # drop the previous burst's single-page PDFs before re-bursting
            for f in filepaths_as_list:
                os.remove(f)
            gs_call = ('gs -q -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -dSAFER'
                       ' -o {}page-%d.pdf {}'.format(parent_folder, doc))
            gs_call = gs_call.encode().split()
            null = open(os.devnull, 'wb')
            with Ghostscript(*gs_call, stdout=null) as gs:
                pass
            null.close()
            for p in pages:
                kwargs = pages[p]
                kwargs.update(rule_options)
                parser = (Lattice(**kwargs) if flavor.lower() == 'lattice'
                          else Stream(**kwargs))
                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    # offset page numbers by the documents already processed
                    _t.page = int(p) + i
                tables.extend(t)
            i += len(pages)
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        for f in ['csv', 'excel', 'json', 'html']:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != 'excel' else 'xlsx'
            f_datapath = os.path.join(f_datapath, '{}.{}'.format(froot, ext))
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, 'json')
        jsonpath = os.path.join(jsonpath, '{}.json'.format(froot))
        tables.export(jsonpath, f='json')
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, 'json/*.json'))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
def split(file_id):
    try:
        session = Session()
        file = session.query(File).filter(File.file_id == file_id).first()
        extract_pages, total_pages = get_pages(file.filepath, file.pages)

        parent_folder = os.path.join(conf.PDFS_FOLDER, file_id, '')

        # extract into single-page PDFs
        gs_call = ('gs -q -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -dSAFER'
                   ' -o {}page-%d.pdf {}'.format(parent_folder,
                                                 file.filepath))
        gs_call = gs_call.encode().split()
        null = open(os.devnull, 'wb')
        with Ghostscript(*gs_call, stdout=null) as gs:
            pass

        # PDF to PNG files for each page
        gs_call = 'gs -q -sDEVICE=png16m -o {}page-%d.png -r300 {}'.format(
            parent_folder, file.filepath)
        gs_call = gs_call.encode().split()
        with Ghostscript(*gs_call, stdout=null) as gs:
            pass
        null.close()

        (filenames, filepaths, imagenames, imagepaths, filedims, imagedims,
         detected_areas) = ({} for i in range(7))
        for page in extract_pages:
            filename = 'page-{}.pdf'.format(page)
            filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
            imagename = ''.join([filename.replace('.pdf', ''), '.png'])
            imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

            filenames[page] = filename
            filepaths[page] = filepath
            imagenames[page] = imagename
            imagepaths[page] = imagepath
            filedims[page] = get_file_dim(filepath)
            imagedims[page] = get_image_dim(imagepath)

            lattice_areas, stream_areas = (None for i in range(2))
            # lattice
            parser = Lattice()
            tables = parser.extract_tables(filepath)
            if len(tables):
                lattice_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    lattice_areas.append((x1, y2, x2, y1))
            # stream
            parser = Stream()
            tables = parser.extract_tables(filepath)
            if len(tables):
                stream_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    stream_areas.append((x1, y2, x2, y1))

            detected_areas[page] = {
                'lattice': lattice_areas,
                'stream': stream_areas
            }

        file.extract_pages = json.dumps(extract_pages)
        file.total_pages = total_pages
        file.has_image = True
        file.filenames = json.dumps(filenames)
        file.filepaths = json.dumps(filepaths)
        file.imagenames = json.dumps(imagenames)
        file.imagepaths = json.dumps(imagepaths)
        file.filedims = json.dumps(filedims)
        file.imagedims = json.dumps(imagedims)
        file.detected_areas = json.dumps(detected_areas)

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
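# A standalone equivalent of the pdfwrite call above (a sketch, assuming a
# `gs` binary on PATH): the %d in the -o template is expanded by
# Ghostscript per output page, which is what bursts the upload into
# page-1.pdf, page-2.pdf, ... inside parent_folder.
import subprocess


def burst_pdf(pdf_path, out_dir):
    # hypothetical helper; one single-page PDF is written per input page
    subprocess.check_call([
        'gs', '-q', '-sDEVICE=pdfwrite', '-dNOPAUSE', '-dBATCH', '-dSAFER',
        '-o', os.path.join(out_dir, 'page-%d.pdf'), pdf_path
    ])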
def extract(job_id):  # noqa
    try:
        session = Session()
        job = session.query(Job).filter(Job.job_id == job_id).first()
        rule = session.query(Rule).filter(Rule.rule_id == job.rule_id).first()
        file = session.query(File).filter(File.file_id == job.file_id).first()

        rule_options = json.loads(rule.rule_options)
        flavor = rule_options.pop("flavor")
        pages = rule_options.pop("pages")

        tables = []
        filepaths = json.loads(file.filepaths)
        for p in pages:
            if p not in filepaths:
                continue
            if flavor.lower() == "lattice":
                kwargs = pages[p]
                parser = Lattice(**kwargs)
                t = parser.extract_tables(filepaths[p])
                for _t in t:
                    _t.page = int(p)
                tables.extend(t)
            else:
                opts = pages[p]
                areas, columns = (
                    opts.get("table_areas", None),
                    opts.get("columns", None),
                )
                if areas and columns:
                    page_order = 1
                    for area, column in zip(areas, columns):
                        bbox = ([round(v, 2)
                                 for v in map(float, area.split(","))]
                                if area else [])
                        cols = (list(map(float, column.split(",")))
                                if column else [])
                        split_text = rule_options.get("split_text", False)
                        if cols and bbox:
                            # columns are drawn relative to the selection
                            # box; shift by its left edge to get absolute
                            # page coordinates
                            abs_cols = [round(c + bbox[0], 2) for c in cols]
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = ",".join(map(str, abs_cols))
                            if len(abs_cols) > 4 and split_text:
                                pass  # split_text = False
                        elif bbox:
                            table_region = bbox
                            table_area = ",".join(map(str, bbox))
                            table_columns = None
                            split_text = False
                        else:
                            table_region = None
                            table_area = None
                            table_columns = None
                        kwargs = dict(
                            table_regions=([table_region]
                                           if table_region else None),
                            table_areas=[table_area] if table_area else None,
                            columns=([table_columns]
                                     if table_columns else None),
                            row_tol=rule_options.get("row_close_tol", 2),
                            column_tol=rule_options.get("col_close_tol", 0),
                            edge_tol=rule_options.get("edge_close_tol", 50),
                            flag_size=rule_options.get("flag_size", False),
                            split_text=split_text,
                            strip_text=rule_options.get("strip_text", ""),
                        )
                        print(f"Using Stream({kwargs!r})")  # debug output
                        parser = Stream(**kwargs)
                        t = parser.extract_tables(filepaths[p])
                        print(f"Result: {t}")
                        for _t in t:
                            _t.page = int(p)
                            _t.order = page_order
                            print(f"Table {_t.order}, Page {_t.page}: "
                                  f"{_t.parsing_report}")
                            # normalize odd single-row/single-cell results
                            if _t.df.shape == (1, 2):
                                _t.df = _t.df.T
                            elif _t.shape == (1, 1):
                                _t.df = pd.concat(
                                    [
                                        _t.df[0],
                                        _t.df.replace(
                                            {0: {_t.df.iat[0, 0]: ""}})[0],
                                    ],
                                    axis=0,
                                    ignore_index=True,
                                )
                            if len(_t.df.shape) < 2:
                                _t.df = _t.df.to_frame()
                            if _t.df.shape[1] < 4:
                                # drop fully empty rows in narrow tables
                                # (pd.np was removed in pandas 1.0, so use a
                                # plain float NaN instead)
                                _t.df = (_t.df.replace({"": float("nan")})
                                         .dropna(how="all").fillna(""))
                            print(_t.df)
                            page_order += 1
                        tables.extend(t)
                else:
                    continue
        tables = TableList(tables)

        froot, fext = os.path.splitext(file.filename)
        datapath = os.path.dirname(file.filepath)
        # clear out exports left over from a previous run of this job
        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            for dirname, dirs, files in os.walk(datapath):
                for of in files:
                    if of.endswith(("." + f, ".zip", ".xlsx")):
                        fp = os.path.join(dirname, of)
                        os.remove(fp)
            try:
                os.removedirs(f_datapath)
            except FileNotFoundError:
                pass

        for f in ["csv", "excel", "json", "html"]:
            f_datapath = os.path.join(datapath, f)
            mkdirs(f_datapath)
            ext = f if f != "excel" else "xlsx"
            f_datapath = os.path.join(f_datapath, "{}.{}".format(froot, ext))
            print(f"Exporting as {f} to {f_datapath}")
            tables.export(f_datapath, f=f, compress=True)

        # for render
        jsonpath = os.path.join(datapath, "json")
        jsonpath = os.path.join(jsonpath, "{}.json".format(froot))
        tables.export(jsonpath, f="json")
        render_files = {
            os.path.splitext(os.path.basename(f))[0]: f
            for f in glob.glob(os.path.join(datapath, "json/*.json"))
        }

        job.datapath = datapath
        job.render_files = json.dumps(render_files)
        job.is_finished = True
        job.finished_at = dt.datetime.now()

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)
def split(file_id):
    try:
        session = Session()
        file = session.query(File).filter(File.file_id == file_id).first()
        extract_pages, total_pages = get_pages(file.filepath, file.pages)

        (filenames, filepaths, imagenames, imagepaths, filedims, imagedims,
         detected_areas) = ({} for i in range(7))
        for page in extract_pages:
            # extract into single-page PDF
            save_page(file.filepath, page)

            filename = "page-{}.pdf".format(page)
            filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
            imagename = "".join([filename.replace(".pdf", ""), ".png"])
            imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

            # convert single-page PDF to PNG
            gs_call = ("-q -sDEVICE=pngalpha -dBackgroundColor=16#000000"
                       " -o {} -r300 {}".format(imagepath, filepath))
            gs_call = gs_call.encode().split()
            null = open(os.devnull, "wb")
            with Ghostscript(*gs_call, stdout=null):
                pass
            null.close()

            filenames[page] = filename
            filepaths[page] = filepath
            imagenames[page] = imagename
            imagepaths[page] = imagepath
            filedims[page] = get_file_dim(filepath)
            imagedims[page] = get_image_dim(imagepath)

            lattice_areas, stream_areas = (None for i in range(2))
            # lattice
            parser = Lattice()
            tables = parser.extract_tables(filepath)
            if len(tables):
                lattice_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    lattice_areas.append((x1, y2, x2, y1))
            # stream
            parser = Stream()
            tables = parser.extract_tables(filepath)
            if len(tables):
                stream_areas = []
                for table in tables:
                    x1, y1, x2, y2 = table._bbox
                    stream_areas.append((x1, y2, x2, y1))

            detected_areas[page] = {
                "lattice": lattice_areas,
                "stream": stream_areas
            }

        file.extract_pages = json.dumps(extract_pages)
        file.total_pages = total_pages
        file.has_image = True
        file.filenames = json.dumps(filenames)
        file.filepaths = json.dumps(filepaths)
        file.imagenames = json.dumps(imagenames)
        file.imagepaths = json.dumps(imagepaths)
        file.filedims = json.dumps(filedims)
        file.imagedims = json.dumps(imagedims)
        file.detected_areas = json.dumps(detected_areas)

        session.commit()
        session.close()
    except Exception as e:
        logging.exception(e)