def write_crops(file_name, cropped_tables=None, cropped_text=None, temp_table_path=TABLE_FOLDER,
                temp_text_path=TEXT_FOLDER, page_number=None):
    """
    Writes table and text images under table and text folder.

    Every table image is converted to greyscale, deskewed and saved as
    ``<temp_table_path>/<file_name>/table_pag_<page_number>_<index>.jpeg``.
    The text image is saved as-is as
    ``<temp_text_path>/<file_name>/text_pag_<page_number>.jpeg``.

    :param file_name: name of the sub-folder (under both base folders) the images are written to
    :param cropped_tables: list of pillow images, or None when there are no tables
    :param cropped_text: a single pillow image (it is saved with one ``save`` call), or None
    :param temp_table_path: base folder for table images
    :param temp_text_path: base folder for text images
    :param page_number: page number used in the generated file names
    :return: tuple ``(table_paths, text_path)``; ``table_paths`` is the list of written table
        image paths (empty when no tables), ``text_path`` is the written text image path or None
    :raises OutputError: if an image cannot be saved (IOError or ValueError from ``save``)
    """
    logger.info('Writing cropped tables...')
    table_paths = []
    text_path = None

    if cropped_tables is not None:
        for i, ct in enumerate(cropped_tables):
            new_file_path = os.path.join(
                temp_table_path,
                str(file_name),
                'table_pag_{pag_num}_{c}.jpeg'.format(pag_num=page_number, c=i))
            ct = ct.convert('L')

            logger.info('Deskewing table...')
            sd = deskew.Deskew(input_numpy=np.asarray(ct), output_numpy=True)
            de_skewed_image_np = sd.run()
            logger.info('Deskew done')

            ct = Image.fromarray(de_skewed_image_np)
            ct = ct.convert(mode='L')
            try:
                ct.save(new_file_path, dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
                logger.info('Image_{} wrote on disk'.format(new_file_path))
            # A tuple is required here: ``except IOError or ValueError`` only catches IOError.
            except (IOError, ValueError) as e:
                raise OutputError('Cannot write image on disk: \n{}'.format(e)) from e
            table_paths.append(new_file_path)
        logger.info('Writing cropped tables done.')
    else:
        logger.info('No tables to write on disk')

    if cropped_text is not None:
        logger.info('Writing cropped text...')
        new_file_path = os.path.join(
            temp_text_path, str(file_name), 'text_pag_{}.jpeg'.format(page_number))
        try:
            cropped_text.save(new_file_path, dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
            logger.info('Image_{} wrote on disk'.format(new_file_path))
        except (IOError, ValueError) as e:
            raise OutputError('Cannot write image on disk: \n{}'.format(e)) from e
        logger.info('Writing cropped text done.')
        text_path = new_file_path

    return table_paths, text_path
def write_image_on_disk(file_name, pil_image, page=0, path=PATH_TO_EXTRACTED_IMAGES):
    """
    Writes image on disk as ``<path>/<file_name>_page_<page>.jpeg``.

    :param file_name: name of original file
    :param pil_image: image object exposing a pillow-style ``save(path, dpi=...)`` method
    :param page: page counter from upward function.
    :param path: path/to/folder where to write images
    :return: None
    :raises OutputError: if the image cannot be saved (IOError or ValueError from ``save``)
    """
    logger.info('Writing temp images on disk...')
    path_to_image = os.path.join(path, '{fn}_page_{c}.jpeg'.format(fn=file_name, c=page))
    try:
        pil_image.save(path_to_image, dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
        logger.info('Image_{} wrote on disk'.format(page))
    # A tuple is required here: ``except IOError or ValueError`` only catches IOError.
    except (IOError, ValueError) as e:
        raise OutputError(
            message='Cannot write image on disk: \n{}'.format(e)
        ) from e
def do_tesseract_on_tables(table_path, destination_pdf_path=TABLE_FOLDER):
    """
    Runs tesseract on a table image to produce a pdf, then runs tabula on that pdf to produce a csv.

    Both output files are named after the image (without extension) and are written in
    ``<destination_pdf_path>/<name of the folder containing the image>``.

    :param table_path: path/to/table image
    :param destination_pdf_path: base folder; tesseract is run from the sub-folder named after
        the image's parent folder
    :return: None
    :raises InputError: if ``table_path`` does not exist or the pdf is missing after tesseract ran
    :raises OutputError: if tesseract exits with a non-zero code or tabula fails
    """
    # checking if file exists
    if not os.path.isfile(table_path):
        raise InputError('{} not found'.format(table_path))

    # takes only file name without extension
    input_file_name = os.path.basename(table_path).split(os.extsep)[0]
    # take the name of the folder in which the images are stored, that is the name of the original pdf
    pdf_name = os.path.dirname(table_path).split(os.path.sep)[-1]
    input_file = table_path

    # Define config parameters.
    # '-l ita' for using the Italian language
    # '--oem 1' for using LSTM OCR Engine
    # --oem 2 for using Legacy + LSTM engines NOT AVAILABLE IN ITALIAN
    # '--psm 12' for sparse text with OSD. 3 is default and it's not working bad.
    config_list = ['-l', 'ita', '--oem', '1', '--psm', '12']

    args = [
        "tesseract",
        input_file,  # actual file to be analyzed
        input_file_name,  # output file name
        *config_list,
        'pdf',  # output config; the original command line passed it twice
    ]

    start_folder = os.path.join(destination_pdf_path, pdf_name)
    proc = Popen(
        args,
        stdin=PIPE,
        stdout=PIPE,
        stderr=STDOUT,
        cwd=start_folder
    )
    # stderr is merged into stdout, so the second element is always None and
    # every diagnostic printed by tesseract is found in ``output``.
    output, _ = proc.communicate()
    if isinstance(output, bytes):
        output = output.decode(errors='replace')

    if proc.returncode != 0:
        # something went wrong
        logger.error("Error while extracting pdf from {}".format(input_file))
        logger.error("Tesseract Output: {}".format(output))
        raise OutputError('Tesseract is not performing well...\n{}'
                          .format("Tesseract Error: {}".format(output)))

    # Everything went well
    logger.info("pdf was successfully extracted from: {}".format(input_file_name))
    logger.info('Tesseract output: {}'.format(output))

    # now extracting tables with tabula
    tabula_input_path = os.path.join(destination_pdf_path, pdf_name, '{}.pdf'.format(input_file_name))
    if not os.path.isfile(tabula_input_path):
        raise InputError('{} was not found. Maybe was not created?'.format(tabula_input_path))
    tabula_output_path = os.path.join(destination_pdf_path, pdf_name, '{}.csv'.format(input_file_name))
    try:
        tabula.convert_into(
            tabula_input_path,
            output_path=tabula_output_path,
            output_format='csv',
            pages='all'
        )
    except Exception as e:
        raise OutputError('Tabula is not performing well...\n{}'.format(e)) from e
def _locate_page_image(temp_folder, thread_name, page, max_width=6):
    """Return the path of the temp .pgm file written for ``page``.

    The page number in the file name may be zero-padded (1, 01, 001, ...), so every
    padding width from the natural one up to ``max_width`` digits is probed. When no
    candidate exists the un-padded path is returned, so that opening it fails with
    FileNotFoundError.
    """
    digits = str(page)
    name_template = 'temp-{tn}-{n}.pgm'
    for width in range(len(digits), max(len(digits), max_width) + 1):
        candidate = os.path.join(
            temp_folder, name_template.format(tn=thread_name, n=digits.zfill(width)))
        if os.path.isfile(candidate):
            return candidate
    return os.path.join(temp_folder, name_template.format(tn=thread_name, n=digits))


def from_pdf_to_pil_generator(file_path, temp_folder=TEMP_IMG_FOLDER_FROM_PDF, thread_name=None):
    """
    Create a page generator from pdf to make it load less RAM as it takes one page at a time.
    It extracts one page at a time from the pdf with pdftoppm, loads it in RAM and yields it.
    The page image is temporarily written in temp_folder and deleted once it has been loaded.

    :param file_path: path/to/file.pdf
    :param thread_name: name of the thread in case of batch process; it is part of the temp file name
    :param temp_folder: path/to/folder to store temp image before acquiring it in RAM
    :return: generator of greyscale (mode 'L') PIL images, one per page. The generator simply
        ends when pdftoppm exits with a non-zero code (treated as end of file).
    :raises InputError: if ``file_path`` does not exist or the extracted page image is not found
    :raises OutputError: if ``temp_folder`` cannot be created
    """
    if not os.path.isfile(file_path):
        raise InputError(message='{} not found'.format(file_path))

    if os.path.isdir(temp_folder):
        logger.info('{} already exists. No need to create it'.format(temp_folder))
    else:
        try:
            os.makedirs(temp_folder)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise OutputError(message=exc) from exc
            logger.info('{} already exists. No need to create it'.format(temp_folder))
        else:
            logger.info('Temp folder for extraction written on disk')

    output_prefix = os.path.join(temp_folder, "temp-{}".format(thread_name))
    page = 1
    # Extract one page at a time, from the first page onwards, until pdftoppm fails.
    while True:
        args = [
            "pdftoppm",
            "-l",
            str(page),
            "-f",
            str(page),
            "-r",
            str(EXTRACTION_DPI),
            "-gray",
            file_path,
            output_prefix
        ]
        proc = Popen(
            args,
            stdin=PIPE,
            stdout=PIPE,
            stderr=STDOUT,
        )
        output, outerr = proc.communicate()

        if proc.returncode != 0:
            # case mostly used for stopping iteration when EOF.
            # NOTE: stderr is redirected to stdout, so ``outerr`` is always None here and
            # every pdftoppm failure ends the iteration; the error branch is kept as a guard.
            if outerr is None:
                logger.warning('pdftoppm output: {}'.format(output))
                logger.warning('Probably reached end of file.')
                # ``raise StopIteration`` inside a generator becomes RuntimeError (PEP 479);
                # a plain return is the correct way to finish.
                return
            logger.error('Something went wrong...')
            logger.error('pdftoppm output: {}'.format(output))
            raise InputError(
                message='pdftoppm error: {}'.format(outerr))

        # Everything went well
        logger.info("Page {} successfully extracted".format(page))
        fp = _locate_page_image(temp_folder, thread_name, page)
        try:
            # convert() loads the pixel data and returns an independent greyscale image,
            # and the context manager closes the file, so it can be deleted safely.
            with Image.open(fp) as page_file:
                img = page_file.convert(mode='L')
        except FileNotFoundError as e:
            raise InputError(message=e) from e
        if os.path.exists(fp):
            os.remove(fp)
        page += 1
        yield img
def _ensure_folder(folder):
    """Create ``folder`` (with parents) if missing, logging the outcome.

    :raises OutputError: if the folder cannot be created for a reason other than already existing
    """
    logger.info('Creating {}...'.format(folder))
    try:
        os.makedirs(folder)
    except OSError as exc:
        # EEXIST also covers the race where another process creates the folder first
        if exc.errno != errno.EEXIST:
            raise OutputError('{} was not created correctly.'.format(folder)) from exc
        logger.info('{} already present'.format(folder))
    else:
        logger.info('{} created successfully'.format(folder))


def create_temp_folders(file_name, temp_table_folder=TABLE_FOLDER, temp_text_folder=TEXT_FOLDER):
    """
    Creates the table and text base folders and their ``file_name`` sub-folders.

    Existing folders and their content are kept: the removal of previous files is
    currently disabled (the ``shutil.rmtree`` calls are commented out).

    :param file_name: name of the sub-folder created under both base folders
    :param temp_table_folder: base folder for table images
    :param temp_text_folder: base folder for text images
    :return: None
    :raises OutputError: if one of the folders cannot be created
    """
    logger.info('Clear and create temp file for images from pdf')

    # base folders for table and text images
    _ensure_folder(temp_table_folder)
    _ensure_folder(temp_text_folder)

    table_subfolder = os.path.join(temp_table_folder, str(file_name))
    text_subfolder = os.path.join(temp_text_folder, str(file_name))

    # NOTE(review): nothing is removed in the two branches below although the log says so;
    # confirm whether the rmtree calls should be restored or the messages dropped.
    if os.path.isdir(table_subfolder):
        logger.info('Clearing table temp folder from existing files...')
        # shutil.rmtree(os.path.join(temp_table_folder, str(file_name)), ignore_errors=True)
        logger.info('Clear done')

    if os.path.isdir(text_subfolder):
        logger.info('Clearing text temp folder from existing files...')
        # shutil.rmtree(os.path.join(temp_text_folder, str(file_name)), ignore_errors=True)
        logger.info('Clear done')

    # per-document folders for table and text images
    _ensure_folder(table_subfolder)
    _ensure_folder(text_subfolder)