def job(x):
    """Convert one PDF to a text file and advance the shared progress counter.

    The output name is ``x`` with every ``'.pdf'`` replaced by ``'.txt'``; when
    the module-level ``output_dir`` is set, only the file name is kept and it is
    appended to ``output_dir``. If that file does not exist yet, the script
    named by the module-level ``pdf2txt`` is run with ``python3``. When the
    result is missing or no larger than ``MIN_BYTES``, ``ocr.pdfocr2txt`` is
    called on the same input/output pair.

    Args:
        x: Path of the PDF to convert.

    Returns:
        The path of the text file, whether it was produced now or already
        existed.
    """
    # Imported locally so the function does not rely on a module-level import.
    import shlex

    y = x.replace('.pdf', '.txt')
    if output_dir:
        # NOTE(review): plain string concatenation - this assumes output_dir
        # already ends with a path separator; confirm against the caller.
        y = output_dir + y.split('/')[-1]

    if not os.path.isfile(y):
        # The file paths are quoted so that spaces or shell metacharacters in
        # them cannot split or alter the command. pdf2txt is left unquoted, as
        # before, in case it carries extra command-line flags.
        if output_dir:
            os.system('python3 {} {} > {}'.format(
                pdf2txt, shlex.quote(x), shlex.quote(y)))
        else:
            # No redirect here: the converter's output goes wherever the
            # script itself sends it, so y may not exist afterwards.
            os.system('python3 {} {}'.format(pdf2txt, shlex.quote(x)))

        # A missing output counts as empty, so it takes the OCR fallback
        # instead of raising FileNotFoundError from os.stat.
        converted_size = os.stat(y).st_size if os.path.isfile(y) else 0
        if converted_size <= MIN_BYTES:
            logging.info(
                '{}: File Size unsatisfactory. Performing OCR'.format(x))
            ocr.pdfocr2txt(x, y, resolution=resolution, tmp=tmp)
        logging.info('{} Done'.format(x))
    else:
        logging.info('{} already a converted file'.format(x))

    # NOTE(review): count looks like a counter shared between workers; this
    # read-modify-write is not guarded by a lock here - confirm whether that
    # matters for the progress figure.
    count.value += 1
    logging.info('Complete {} out of {}'.format(int(count.value), total))
    return y
def pdf_to_txt(input_pdf, output_txt):
    """Convert ``input_pdf`` to text at ``output_txt``, falling back to OCR.

    Nothing is done when ``output_txt`` already exists. Otherwise the script
    named by the module-level ``pdf2txt`` is run with ``python3`` and its
    standard output is redirected to ``output_txt``. If the resulting file is
    no larger than ``MIN_BYTES``, ``ocr.pdfocr2txt`` is called on the same
    input/output pair.

    Args:
        input_pdf: Path of the PDF to convert.
        output_txt: Path of the text file to create.

    Returns:
        True when a conversion was performed; False when the output already
        existed, the input was not found, or any exception was raised (the
        exception is logged with its traceback and not propagated).
    """
    try:
        # Imported locally so the function does not rely on a module-level
        # import.
        import shlex

        if os.path.isfile(output_txt):
            logging.info('{} already a converted file'.format(input_pdf))
            return False

        if not os.path.isfile(input_pdf):
            logging.info('{}: File not found!'.format(input_pdf))
            return False

        # The file paths are quoted so that spaces or shell metacharacters in
        # them cannot split or alter the command. pdf2txt is left unquoted, as
        # before, in case it carries extra command-line flags.
        os.system('python3 {} {} > {}'.format(
            pdf2txt, shlex.quote(input_pdf), shlex.quote(output_txt)))

        # An empty or near-empty result triggers the OCR fallback.
        if os.stat(output_txt).st_size <= MIN_BYTES:
            logging.info(
                '{}: File Size unsatisfactory. Performing OCR'.format(input_pdf))
            ocr.pdfocr2txt(input_pdf, output_txt,
                           resolution=resolution, tmp=tmp)
        logging.info('{} Done'.format(input_pdf))
        return True
    except Exception:
        # Best-effort boundary: callers get False instead of an exception.
        logging.error(
            "Exception occurred while converting file %s to txt",
            input_pdf, exc_info=True)
        return False