def main():
    """Run the OCRmyPDF directory watcher until interrupted.

    Sets up OCRmyPDF logging, reports the active configuration (a short
    summary at INFO level, the detailed settings at DEBUG level), then
    schedules a recursive watch on INPUT_DIRECTORY. The function idles in
    one-second sleeps; a KeyboardInterrupt stops the observer, after which
    the observer thread is joined.
    """
    ocrmypdf.configure_logging(
        verbosity=ocrmypdf.Verbosity.default,
        manage_root_logger=True,
    )

    # Short, human-oriented summary of where files are read and written.
    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.info(startup_summary)

    # Detailed dump of the settings, keyed by their configuration names.
    settings_dump = (
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"LOGLEVEL: {LOGLEVEL}\n"
    )
    log.debug(settings_dump)

    event_handler = HandleObserverEvent(patterns=PATTERNS)
    watcher = Observer()
    watcher.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    watcher.start()

    # Keep the main thread alive while the observer thread does the work.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
def main():
    """Run the OCRmyPDF directory watcher until interrupted.

    Configures OCRmyPDF logging and the module logger level, reports the
    active configuration, and refuses to start (exit status 1) when
    OCR_JSON_SETTINGS contains 'input_file' or 'output_file'. Otherwise a
    recursive watch on INPUT_DIRECTORY is started, using PollingObserver
    when USE_POLLING is truthy and Observer when it is not. The function
    idles in one-second sleeps; a KeyboardInterrupt stops the observer,
    after which the observer thread is joined.
    """
    ocrmypdf.configure_logging(
        verbosity=ocrmypdf.Verbosity.default,
        manage_root_logger=True,
    )
    log.setLevel(LOGLEVEL)

    # Short, human-oriented summary of where files are read and written.
    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.info(startup_summary)

    # Detailed dump of the settings, keyed by their configuration names.
    settings_dump = (
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"ARGS: {OCR_JSON_SETTINGS}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"USE_POLLING: {USE_POLLING}\n"
        f"LOGLEVEL: {LOGLEVEL}\n"
    )
    log.debug(settings_dump)

    # These two keys are rejected in the user-supplied settings.
    if any(key in OCR_JSON_SETTINGS for key in ('input_file', 'output_file')):
        log.error('OCR_JSON_SETTINGS should not specify input file or output file')
        sys.exit(1)

    event_handler = HandleObserverEvent(patterns=PATTERNS)
    watcher = PollingObserver() if USE_POLLING else Observer()
    watcher.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    watcher.start()

    # Keep the main thread alive while the observer thread does the work.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
def pdf2pdfa(args):
    """Write the normalized PDF for a conversion job and report success.

    For PDF input whose 'version' is one of '1a', '1b', '2a' or '2b'
    (presumably PDF/A conformance levels -- confirm against the caller) the
    file is handed to ``file_copy`` and no OCRmyPDF run takes place. In every
    other case ``ocrmypdf.ocr`` is run from 'tmp_file_path' to
    'norm_file_path'.

    Args:
        args: Mutable job dict. 'mime_type', 'tmp_file_path' and
            'norm_file_path' are read; for PDF input 'source_file_path' and
            'version' are read as well and 'tmp_file_path' is overwritten
            with 'source_file_path'.

    Returns:
        bool: On the copy path, whether 'norm_file_path' exists afterwards.
        On the OCRmyPDF path, whether the run returned ``ExitCode.ok``.
    """
    if args['mime_type'] == 'application/pdf':
        args['tmp_file_path'] = args['source_file_path']

        # WAIT: add an extra check here for what should be done if ocr = True
        if args['version'] in ('1a', '1b', '2a', '2b'):
            file_copy(args)
            return os.path.exists(args['norm_file_path'])

    # -1 is the quiet verbosity level in OCRmyPDF's logging configuration.
    ocrmypdf.configure_logging(-1)
    # NOTE(review): per the OCRmyPDF docs a tesseract timeout of 0 effectively
    # disables OCR, so this call may only normalize the PDF -- confirm intent.
    result = ocrmypdf.ocr(
        args['tmp_file_path'],
        args['norm_file_path'],
        tesseract_timeout=0,
        progress_bar=False,
        skip_text=True,
    )

    # Compare the enum member itself: ExitCode is an IntEnum, and from
    # Python 3.11 str() of an IntEnum member is its number ('0'), so the
    # former check against the text 'ExitCode.ok' never matched there.
    return result == ocrmypdf.ExitCode.ok
else: start_dir = '.' if len(sys.argv) > 2: log_file = sys.argv[2] else: log_file = script_dir + '/ocr-tree.log' logging.basicConfig( level=logging.INFO, format='%(asctime)s %(message)s', filename=log_file, filemode='w', ) ocrmypdf.configure_logging(ocrmypdf.Verbosity.default) for dir_name, _subdirs, file_list in os.walk(start_dir): logging.info(dir_name + '\n') os.chdir(dir_name) for filename in file_list: file_ext = os.path.splitext(filename)[1] if file_ext == '.pdf': full_path = dir_name + '/' + filename print(full_path) result = ocrmypdf.ocr(filename, filename, deskew=True) if result == ocrmypdf.ExitCode.already_done_ocr: print("Skipped document because it already contained text") elif result == ocrmypdf.ExitCode.ok: print("OCR complete") logging.info(result)