def waitForPreprocessedImages(self):
    """
    Poll the preprocessed-image folder until it holds the expected number
    of images.

    Reads ``self.source_folder``, ``self.valid_exts``,
    ``self.expected_image_count``, ``self.pp_retry_num`` and
    ``self.pp_retry_wait``.

    Returns:
        True once the number of files found equals
        ``self.expected_image_count`` (after an extra 30 second pause).
        False if more files than expected are found, or if the folder is
        still incomplete after ``self.pp_retry_num + 1`` polls.
    """
    retry = 0
    while retry <= self.pp_retry_num:
        # ==================================================================
        # Get current number of preprocessed images
        # ==================================================================
        pp_files = fs.getFilesInFolderWithExts(self.source_folder,
                                               self.valid_exts)
        pp_count = len(pp_files)
        # ==================================================================
        # Exit loop when preprocessed images are ready
        # ==================================================================
        if pp_count == self.expected_image_count:
            # Wait 30 sec to make sure images are completely copied
            time.sleep(30)
            return True
        # ==================================================================
        # This shouldn't happen, but we have seen pdf's with duplicate
        # pages, so better check
        # ==================================================================
        if pp_count > self.expected_image_count:
            if pp_count > 0:
                # Report the number of files; the message compares two
                # counts, so the file list itself does not belong here.
                self.debug_message("Der er flere preprocesserede billeder ({}) end scannede billeder ({})"
                                   .format(pp_count, self.expected_image_count))
            return False
        # ==================================================================
        # Wait "self.pp_retry_wait" seconds
        # ==================================================================
        retry += 1
        self.debug_message("Preprocesserede billeder ikke klar, venter {} sek".format(self.pp_retry_wait))
        self.debug_message("Retry {} of {}".format(retry, self.pp_retry_num))
        time.sleep(self.pp_retry_wait)
    # All polls used without reaching the expected count
    return False
def addBindingsToPdf(self):
    """
    Add a front and a back binding page to the pdf at ``self.pdf_bw_path``.

    The first and last entries of the image list for
    ``self.img_master_path`` are each converted to a one-page pdf in a
    temporary folder, joined around the existing pdf, and the result is
    moved over ``self.pdf_bw_path``.

    The temporary folder is removed whether or not the operation succeeds.

    Raises:
        IndexError: if no images are found in ``self.img_master_path``.
    """
    #=======================================================================
    # Get density for bw-pdf (i.e. DPI/PixelsPerInch)
    #=======================================================================
    density = pdf_tools.getDensity(src=self.pdf_bw_path, layer=0)
    #=======================================================================
    # Create temp folder for temp pdf-files
    #=======================================================================
    temp_folder = os.path.join(self.temp_root, self.process_title)
    tools.create_folder(temp_folder)
    try:
        #===================================================================
        # Get path for first and last image
        # NOTE(review): assumes the helper returns the files in page order
        # - confirm against fs.getFilesInFolderWithExts
        #===================================================================
        images = fs.getFilesInFolderWithExts(self.img_master_path,
                                             self.valid_exts,
                                             absolute=True)
        if not images:
            # Same exception type as the bare images[0] lookup would give,
            # but with a message that names the folder.
            raise IndexError('No images found in {0}'.format(self.img_master_path))
        #===================================================================
        # Create PDF of bindings (first and last image in master image
        # folder)
        #===================================================================
        front_image_path = images[0]
        end_image_path = images[-1]
        front_pdf_path = os.path.join(temp_folder, 'front.pdf')
        end_pdf_path = os.path.join(temp_folder, 'end.pdf')
        image_tools.compressFile(input_file=front_image_path,
                                 output_file=front_pdf_path,
                                 quality=self.quality,
                                 resize=self.resize,
                                 density=density)
        image_tools.compressFile(input_file=end_image_path,
                                 output_file=end_pdf_path,
                                 quality=self.quality,
                                 resize=self.resize,
                                 density=density)
        #===================================================================
        # Add front and back-binding to pdf
        #===================================================================
        pdf_list = [front_pdf_path, self.pdf_bw_path, end_pdf_path]
        temp_dest = os.path.join(temp_folder, self.process_title + '.pdf')
        pdf_tools.joinPdfFiles(pdf_list, temp_dest)
        #===================================================================
        # Move new pdf from temp to bw-pdf location (overwrite)
        #===================================================================
        shutil.move(temp_dest, self.pdf_bw_path)
    finally:
        #===================================================================
        # Delete temp_folder, also when a step above failed, so no
        # half-built pdf files are left behind
        #===================================================================
        fs.clear_folder(temp_folder, also_folder=True)
def createPdfFromFolder(src, file_dest, temp_folder, quality=50, resize_pct=50, valid_exts=None):
    '''
    Use ImageMagick to create one pdf from all the images in a folder and
    output to a given destination.

    Create a pdf of each image and place in temp folder. Merge output
    pdf-files to pdf-dest and remove temp folder.

    :param src: folder holding the source images
    :param file_dest: path of the merged pdf to create
    :param temp_folder: folder for the per-image pdf files; it is removed
        at the end of a successful run
    :param quality: passed through to image_tools.compressFile
    :param resize_pct: passed through to image_tools.compressFile
    :param valid_exts: file extensions to include; defaults to
        ['jpg', 'tif'] when None
    '''
    if valid_exts is None:
        # Build the default per call instead of sharing one list object
        # between all calls.
        valid_exts = ['jpg', 'tif']
    image_paths = fs.getFilesInFolderWithExts(src, valid_exts)
    for image in image_paths:
        # Take the stem from the bare name. Splitting the already quoted
        # name would leave the closing quote in the discarded extension and
        # produce an unbalanced '"name.pdf'.
        file_name, _ = os.path.splitext(image)
        # Handle spaces in filenames
        # NOTE(review): the quoting presumes compressFile builds a shell
        # command line from these paths - confirm against image_tools.
        input_path = os.path.join(src, '"' + image + '"')
        output_path = os.path.join(temp_folder, '"' + file_name + '.pdf"')
        image_tools.compressFile(input_path, output_path, quality, resize_pct)
    pdf_misc.mergePdfFilesInFolder(temp_folder, file_dest)
    fs.clear_folder(temp_folder, also_folder=True)
def step(self):
    """
    Wait for the preprocessed images and copy them to the OCR server.

    Loads the settings with ``self.getVariables()``, waits with
    ``self.waitForPreprocessedImages()`` and then copies the files from
    ``self.source_folder`` to ``self.hotfolder_dir`` via
    ``self.transit_dir`` using ``tools.copy_files``.

    Returns:
        None on success, otherwise a string describing the error. No
        exception derived from ``Exception`` leaves this method.
    """
    error = None
    try:
        self.getVariables()
        msg = 'Copying files from {0} to {1} via transit {2}.'
        msg = msg.format(self.source_folder, self.hotfolder_dir, self.transit_dir)
        self.debug_message(msg)
        # ==================================================================
        # Wait for preprocessed images to be ready
        # Returns false if it times out
        # ==================================================================
        if not self.waitForPreprocessedImages():
            pp_files = fs.getFilesInFolderWithExts(self.source_folder,
                                                   self.valid_exts)
            # The message reports a count, so format the number of files
            # rather than the file list.
            raise Exception('Timed out or count error while waiting for pre-processing of '
                            'images. Current number of processed images: '
                            '{0}. Expected amount: {1}'.format(len(pp_files),
                                                               self.expected_image_count))
        # ==================================================================
        # Copy files to OCR-server
        # ==================================================================
        self.debug_message("Start copy of preprocessed images to OCR-server")
        tools.copy_files(source=self.source_folder,
                         dest=self.hotfolder_dir,
                         transit=self.transit_dir,
                         delete_original=False,
                         wait_interval=self.retry_wait,
                         max_retries=self.retry_num,
                         logger=self.glogger,
                         valid_exts=self.valid_exts)
        self.debug_message("Finished copy of preprocessed images to OCR-server")
    except (errors.TransferError, errors.TransferTimedOut) as e:
        # Both transfer errors are reported through their strerror
        error = e.strerror
    except Exception as e:
        # Step boundary: any other failure is reported as the step's error
        error = str(e)
    return error
def getVariables(self):
    """
    Get all required vars from command line + config and confirm their
    existence.

    Sets ``self.master_folder``, ``self.source_folder``,
    ``self.transit_dir``, ``self.hotfolder_dir``, ``self.retry_wait``,
    ``self.retry_num``, ``self.valid_exts``, ``self.pp_retry_wait``,
    ``self.pp_retry_num`` and ``self.expected_image_count``.

    Raises:
        KeyError: if the setting 'ocr_workflow_type' is missing.
        ValueError: if 'ocr_workflow_type' is neither 'antikva' nor
            'fraktur'.
    """
    process_title = self.command_line.process_title
    process_path = self.command_line.process_path
    # ======================================================================
    # Path to folder with master image files
    # ======================================================================
    mi_img = self.getConfigItem('img_master_path',
                                section=self.folder_structure_section)
    self.master_folder = os.path.join(process_path, mi_img)
    # ======================================================================
    # Path to folder with preprocessed files
    # ======================================================================
    pp_img = self.getConfigItem('img_pre_processed_path',
                                section=self.folder_structure_section)
    self.source_folder = os.path.join(process_path, pp_img)
    # ======================================================================
    # legr: Get the correct OCR server for the process - antikva or fraktur
    # Break if argument somehow is missing or have an invalid name
    # ======================================================================
    try:
        ocr_workflow_type = self.getSetting('ocr_workflow_type').lower()
    except KeyError:
        self.error_message('{0} er ikke givet med som variabel til scriptet.'.format('ocr_workflow_type'))
        # Stop here: ocr_workflow_type is unbound, so continuing would only
        # fail later with an unrelated UnboundLocalError.
        raise
    if ocr_workflow_type == 'antikva':
        # legr: currently antikva on ocr-01
        ocr_transitfolder = self.getSetting('ocr_antikva_transit')
        ocr_hotfolder = self.getSetting('ocr_antikva_hotfolder')
    elif ocr_workflow_type == 'fraktur':
        # legr: currently fraktur on ocr-02
        ocr_transitfolder = self.getSetting('ocr_fraktur_transit')
        ocr_hotfolder = self.getSetting('ocr_fraktur_hotfolder')
    else:
        err = ('Variablen "{0}" fra kaldet af "{1}" skal enten vaere '
               '"fraktur" eller "antikva", men er pt. "{2}".')
        err = err.format('ocr_workflow_type', self.name, ocr_workflow_type)
        self.error_message(err)
        # Stop here: the transit/hotfolder variables are unbound for an
        # unknown workflow type.
        raise ValueError(err)
    self.transit_dir = os.path.join(ocr_transitfolder, process_title)
    self.hotfolder_dir = os.path.join(ocr_hotfolder, process_title)
    # ======================================================================
    # Set retry wait time and retry count for copying files
    # ======================================================================
    self.retry_wait = int(self.getConfigItem('retry_wait'))
    self.retry_num = int(self.getConfigItem('retry_num'))
    # ======================================================================
    # Set valid extensions for image files to check as preprocessed
    # ======================================================================
    self.valid_exts = self.getConfigItem('valid_file_exts',
                                         None,
                                         self.valid_exts_section).split(';')
    # ======================================================================
    # Set variables for waiting for preprocessed images to be ready
    # ======================================================================
    self.pp_retry_wait = int(self.getConfigItem('preprocess_retry_wait'))
    self.pp_retry_num = int(self.getConfigItem('preprocess_retry_num'))
    img_list = fs.getFilesInFolderWithExts(self.master_folder, self.valid_exts)
    # Source images minus first and last image
    # NOTE(review): this is negative when the master folder holds fewer
    # than two images - confirm that callers can cope with that.
    self.expected_image_count = len(img_list) - 2