コード例 #1
0
 def step(self):
     '''
     Move altos, toc and pdf from LIMB to Goobi.

     Calls self.getVariables() first, then moves the three LIMB output
     folders to their Goobi counterparts with self.moveFiles() and finally
     tries to remove the (now empty) LIMB process folder.

     Returns None on success, or when the Goobi folder is not ignored and
     limb_tools.alreadyMoved() reports that the files are already in place.
     Returns an error message string when a ValueError, TransferError,
     TransferTimedOut or IOError is raised along the way.
     '''
     try:
         self.getVariables()
         # Nothing to do if the files have already been copied.
         if (not self.ignore_goobi_folder and
                 limb_tools.alreadyMoved(self.goobi_toc, self.goobi_pdf,
                                         self.input_files, self.goobi_altos,
                                         self.valid_exts)):
             return None
         tools.ensureDirsExist(self.limb_altos, self.limb_toc, self.limb_pdf)
         self.moveFiles(self.limb_altos, self.goobi_altos)
         self.moveFiles(self.limb_toc, self.goobi_toc)
         self.moveFiles(self.limb_pdf, self.goobi_pdf)
         # Delete the empty process folder in LIMB's output folder.
         # os.rmdir only removes empty directories; a failure here is
         # reported but is not treated as an error of the step.
         try:
             os.rmdir(self.limb_process_root)
         except OSError:
             msg = 'Process folder "{0}" on LIMB could not be deleted.'
             msg = msg.format(self.limb_process_root)
             self.info_message(msg)
     except ValueError as e:
         # ValueError has no ``strerror`` attribute, so reading it would
         # raise AttributeError instead of reporting the problem.
         return str(e) or repr(e)
     except (TransferError, TransferTimedOut, IOError) as e:
         # ``strerror`` may be missing or None (e.g. IOError('text')).
         # Returning None would wrongly signal success, so always fall
         # back to a non-empty description.
         return getattr(e, 'strerror', None) or str(e) or repr(e)
     return None
コード例 #2
0
 def step(self):
     '''
     Validate the output of a LIMB process.

     self.getVariables() is called first to populate the attributes used
     below. If the Goobi folder is not ignored and the files have already
     been moved there, validation is skipped. Otherwise the LIMB output
     folders must exist and limb_tools.performValidations() is run on them.

     Returns None when everything is fine (or validation was skipped),
     otherwise an error message string built from the IOError or
     DataError that was raised.
     '''
     try:
         self.getVariables()
         # Files already on the goobi-server? Then there is nothing to check.
         if not self.ignore_goobi_folder:
             already_moved = limb_tools.alreadyMoved(self.goobi_toc,
                                                     self.goobi_pdf,
                                                     self.input_files_dir,
                                                     self.goobi_altos,
                                                     self.valid_exts)
             if already_moved:
                 return None
         tools.ensureDirsExist(self.limb_dir,
                               self.alto_dir,
                               self.toc_dir,
                               self.pdf_input_dir,
                               self.input_files_dir)
         limb_tools.performValidations(self.toc_dir,
                                       self.pdf_input_dir,
                                       self.input_files_dir,
                                       self.alto_dir,
                                       self.valid_exts)
     except IOError as io_err:
         return "IOError - {0}".format(io_err.strerror)
     except DataError as data_err:
         return "Validation error - {0}.".format(data_err.strerror)
     return None
コード例 #3
0
    def getVariables(self):
        '''
        Collect the paths and settings this step needs from the command
        line and the config, and store them on the instance.

        Raises ValueError if 'sleep_interval' or 'retries' cannot be
        converted to int.
        tools.ensureDirsExist() is called on the three Goobi target folders
        at the end; presumably it raises IOError when one is missing
        (callers elsewhere catch IOError around it) - confirm in tools.
        '''
        def goobi_path(config_key):
            # Folder inside the Goobi process folder, named by the config.
            return os.path.join(
                self.command_line.process_path,
                self.getConfigItem(config_key, None, 'process_folder_structure'))

        # LIMB side: <limb_output>/<process_title>/{alto, toc, pdf}
        limb_output = self.getConfigItem('limb_output')
        self.limb_process_root = os.path.join(limb_output,
                                              self.command_line.process_title)
        alto_name = self.getConfigItem('alto')
        self.limb_altos = os.path.join(self.limb_process_root, alto_name)
        toc_name = self.getConfigItem('toc')
        self.limb_toc = os.path.join(self.limb_process_root, toc_name)
        pdf_name = self.getConfigItem('pdf')
        self.limb_pdf = os.path.join(self.limb_process_root, pdf_name)

        # Goobi side: target folders inside the process folder
        self.goobi_altos = goobi_path('metadata_alto_path')
        self.goobi_toc = goobi_path('metadata_toc_path')
        self.goobi_pdf = goobi_path('doc_limbpdf_path')

        # Accepted file extensions are stored as a ';'-separated string
        raw_exts = self.getConfigItem('valid_file_exts', None,
                                      self.valid_exts_section)
        self.valid_exts = raw_exts.split(';')

        # Path of the input files (master images) in the process folder
        goobi_root = self.command_line.process_path
        master_rel = self.getConfigItem('img_master_path',
                                        section=self.folder_structure_section)
        self.input_files = os.path.join(goobi_root, master_rel)

        # Flag: ignore whether the files already have been copied to goobi
        self.ignore_goobi_folder = self.getSetting('ignore_goobi_folder',
                                                   bool, default=True)

        raw_interval = self.getConfigItem('sleep_interval', None, 'copy_to_limb')
        self.sleep_interval = int(raw_interval)
        raw_retries = self.getConfigItem('retries', None, 'copy_to_limb')
        self.retries = int(raw_retries)

        tools.ensureDirsExist(self.goobi_altos, self.goobi_toc, self.goobi_pdf)
コード例 #4
0
 def step(self):
     '''
     Check that the color and the black/white PDF each have as many
     pages as there are input pictures.

     self.getVariables() is called first to populate the attributes used
     below. The color PDF folder is compared against self.input_files_dir,
     the bw PDF folder against self.preprocessed_input_files.

     Returns None if both checks pass, otherwise an error message string
     built from the IOError or DataError that was raised.
     '''
     def require_page_count(pdf_dir, pictures_dir):
         # Raise DataError when the PDF and the pictures disagree.
         if limb_tools.pageCountMatches(pdf_dir, pictures_dir, self.valid_exts):
             return
         raise DataError('PDF page count does not match input picture count in "{0}"!'.format(pdf_dir))

     try:
         self.getVariables()
         tools.ensureDirsExist(self.bw_pdf_input_dir,
                               self.color_pdf_input_dir,
                               self.input_files_dir)
         # Color pdf first, then the bw pdf
         require_page_count(self.color_pdf_input_dir, self.input_files_dir)
         require_page_count(self.bw_pdf_input_dir, self.preprocessed_input_files)
     except IOError as io_err:
         return "IOError - {0}".format(io_err.strerror)
     except DataError as data_err:
         return "Validation error - {0}.".format(data_err.strerror)
     return None
コード例 #5
0
 def step(self):
     '''
     Copy PDFs from Goobi to the webserver.

     Calls self.getVariables() first, makes sure both source folders
     exist and then copies the color and the bw folder to
     self.dest_folder with self.copyFiles().

     Returns None on success, otherwise an error message string when a
     ValueError, TransferError, TransferTimedOut or IOError is raised.
     '''
     try:
         self.getVariables()
         tools.ensureDirsExist(self.source_folder_color, self.source_folder_bw)
         self.copyFiles(self.source_folder_color, self.dest_folder)
         self.copyFiles(self.source_folder_bw, self.dest_folder)
     except ValueError as e:
         # ValueError has no ``strerror`` attribute, so reading it would
         # raise AttributeError instead of reporting the problem.
         return str(e) or repr(e)
     except (TransferError, TransferTimedOut, IOError) as e:
         # ``strerror`` may be missing or None (e.g. IOError('text')).
         # Returning None would wrongly signal success, so always fall
         # back to a non-empty description.
         return getattr(e, 'strerror', None) or str(e) or repr(e)
     return None
コード例 #6
0
def getOptions():
	"""
	Parse the command line options of the script.

	Recognised options are -d/--dir (root_dir), -c/--settings (settings)
	and -v/--debug (debug). The parsed options are checked with
	ensureValidOptions() and the root directory must exist.

	Returns the parsed options object.
	On IOError the error and the usage help are printed and the script
	exits with status 1.
	"""
	# Built outside the try block so that ``parser`` is always bound when
	# the error handler needs it.
	parser = OptionParser()
	parser.add_option('-d', '--dir', dest='root_dir', help='The directory containing all the process directories.')
	parser.add_option('-c', '--settings', dest='settings',
		help='The settings file to be read by the scripts. If not specified, the system default will be assumed.')
	parser.add_option('-v', '--debug', dest='debug', help='Should the scripts run in debug mode?')
	try:
		options, _ = parser.parse_args()
		ensureValidOptions(options, parser)
		tools.ensureDirsExist(options.root_dir)
	except IOError as e:
		# ``strerror`` is None for IOError('text'); print the exception then.
		print(e.strerror or e)
		# print_help() writes the help itself and returns None; wrapping it
		# in print() would add a stray "None" line.
		parser.print_help()
		sys.exit(1)

	return options
コード例 #7
0
 def limbIsReady(self):
     """
     Tell whether LIMB has finished.

     Returns False (after logging a debug message) if
     tools.ensureDirsExist() raises IOError for one of the LIMB output
     folders. Otherwise returns True if a toc exists or if the alto file
     count matches the input files, and False if neither is the case.
     """
     try:
         # raises error if one of our directories is missing
         tools.ensureDirsExist(self.limb_dir,
                               self.alto_dir,
                               self.toc_dir,
                               self.pdf_input_dir,
                               self.input_files)
     except IOError as io_err:
         template = ("One of the output folder from LIMB is not yet created."
                     " Waiting for LIMB to be ready. Error: {0}")
         self.debug_message(template.format(io_err.strerror))
         return False
     else:
         # Either signal is enough to consider LIMB done.
         if limb_tools.tocExists(self.toc_dir):
             return True
         return bool(limb_tools.altoFileCountMatches(self.alto_dir,
                                                     self.input_files))
コード例 #8
0
    def getVariables(self):
        """
        Read the METS file name and the PDF input/output folders from the
        config, resolve them against the process path from the command
        line and store them on the instance.

        The two folders and the METS file are then passed to
        tools.ensureDirsExist() / tools.ensureFilesExist(), which are
        expected to raise if something is missing.
        """
        root = self.command_line.process_path

        # METS file of the process
        self.mets_file = os.path.join(
            root,
            self.getConfigItem("metadata_goobi_file", None, self.process_files_section))

        # Folder names relative to the process folder
        limb_pdf_rel = self.getConfigItem("doc_limbpdf_path",
                                          section=self.folder_structure_section)
        pdf_rel = self.getConfigItem("doc_pdf_path",
                                     section=self.folder_structure_section)

        # Absolute input and output folders for the PDFs
        self.pdf_input_dir = os.path.join(root, limb_pdf_rel)
        self.pdf_output_dir = os.path.join(root, pdf_rel)

        # raises exception if one of our directories / files is missing
        tools.ensureDirsExist(self.pdf_input_dir, self.pdf_output_dir)
        tools.ensureFilesExist(self.mets_file)
0
    def getVariables(self):
        '''
        Gather everything the script needs from the command line and the
        config and store it on the instance: the METS file, the OJS root
        and metadata folder, the first PDF found in the LIMB pdf folder,
        the issue field lists, the XML namespaces and empty section lists.

        tools.ensureFilesExist() / tools.ensureDirsExist() are expected to
        raise if the METS file or the OJS metadata folder is missing.
        '''
        folder_section = 'process_folder_structure'
        root = self.command_line.process_path

        self.mets_file = os.path.join(
            root, self.getConfigItem('metadata_goobi_file', None, 'process_files'))

        self.ojs_root = self.getConfigItem('ojs_root')
        self.ojs_metadata_dir = os.path.join(
            root, self.getConfigItem('metadata_ojs_path', None, folder_section))

        # TODO: check files in 'doc_pdf_path' instead of 'doc_limbpdf_path'
        # 'doc_limbpdf_path' contains the split pdf files
        pdf_dir = os.path.join(
            root, self.getConfigItem('doc_limbpdf_path', None, folder_section))
        self.pdf_name = tools.getFirstFileWithExtension(pdf_dir, '.pdf')
        self.pdf_file = os.path.join(pdf_dir, self.pdf_name)

        tools.ensureFilesExist(self.mets_file)
        tools.ensureDirsExist(self.ojs_metadata_dir)

        # boolean setting, defaults to True
        self.overlapping_articles = self.getSetting('overlapping_articles',
                                                    bool, default=True)

        # Path used to generate ojs_dir; 'system' means
        # "define it from system variables"
        self.ojs_journal_path = self.getSetting('ojs_journal_path',
                                                default='system')

        # Required and optional issue fields are ';'-separated in the config
        self.issue_required_fields = self.getConfigItem('issue_required_fields').split(';')
        self.issue_optional_fields = self.getConfigItem('issue_optional_fields').split(';')

        # XML namespaces
        self.mets_ns = 'http://www.loc.gov/METS/'
        self.goobi_ns = 'http://meta.goobi.org/v1.5.1/'

        # Sections start out empty
        self.front_matter = []
        self.articles = []
        self.back_matter = []
コード例 #10
0
ファイル: wait_for_ocr.py プロジェクト: akademy/goobi-scripts
 def ocrIsReady(self):
     '''
     Tell whether OCR has finished.

     Returns False (after logging a debug message) if
     tools.ensureDirsExist() raises IOError for the pdf folder or the
     input files folder. Otherwise returns True if the PDF page count
     matches the input files, else False.
     '''
     try:
         # raises error if one of our directories is missing
         tools.ensureDirsExist(self.pdf_input_dir, self.input_files)
     except IOError as io_err:
         template = ('One of the output folder from OCR is not yet created.'
                     ' Waiting for OCR to be ready. Error: {0}')
         self.debug_message(template.format(io_err.strerror))
         return False
     # legr: limb_tools can be used generally - they are not LIMB specific
     # and should be renamed someday
     return bool(limb_tools.pageCountMatches(self.pdf_input_dir,
                                             self.input_files,
                                             self.valid_exts))
コード例 #11
0
    def getVariables(self):
        '''
        Pull in the variables from the command line and the config file
        that the script needs: the OJS mount, the process folder, the pdf
        folder and the ojs xml folder. The journal folder and the
        destination folder below it are created if necessary.

        Two modes exist during a transition period:
        - the command line has an ISSN: the journal path is looked up
          with ojs.getJournalPath();
        - no ISSN (old processes): the journal path comes from the
          'ojs_journal_path' setting, or from the title in the METS file
          when that setting is 'system'.

        Problems with the variables surface as exceptions raised by the
        helpers called here.
        '''
        # Temporary: new processes should always have an issn, so this check
        # belongs in the essential section once all processes carry one.
        try:
            self.issn = self.command_line.issn
        except AttributeError as e:
            self.debug_message("Warning, missing attribute. Details: {0}".format(e))
            # No ISSN on the command line, so fall back to the old code path
            issn_missing = True
        else:
            issn_missing = False

        process_path = self.command_line.process_path

        # Temporary, until all new processes use issn
        if issn_missing:
            mets_file = os.path.join(
                process_path,
                self.getConfigItem('metadata_goobi_file', None, 'process_files'))

        ojs_mount = self.getConfigItem('ojs_mount')
        metadata_rel = self.getConfigItem('metadata_ojs_path',
                                          section=self.folder_structure_section)
        self.ojs_metadata_dir = os.path.join(process_path, metadata_rel)

        pdf_rel = self.getConfigItem('doc_pdf_path',
                                     section=self.folder_structure_section)
        self.pdf_input_dir = os.path.join(process_path, pdf_rel)

        # Temporary condition, until all new processes use issn
        if issn_missing:
            issue_data = mets_tools.getIssueData(mets_file)
            # 'system' means "define the journal path from system variables"
            self.ojs_journal_path = self.getSetting('ojs_journal_path',
                                                    default='system')
            if self.ojs_journal_path == 'system':
                journal_dir_name = tools.parseTitle(issue_data['TitleDocMain'])
                # TODO: write this one back as a property?
                # self.goobi_com.addProperty(self.process_id, 'ojs_journal_path', volume_title, overwrite=True)
            else:
                journal_dir_name = self.ojs_journal_path
        else:
            # Process with issn: ask OJS for the journal path
            issn = self.command_line.issn
            journal_dir_name = ojs.getJournalPath(self.ojs_server, issn)
        ojs_journal_folder = os.path.join(ojs_mount, journal_dir_name)

        # Create folder and set owner to gid 1000 => ojs-group
        tools.find_or_create_dir(ojs_journal_folder, change_owner=1000)
        self.ojs_dest_dir = os.path.join(ojs_journal_folder,
                                         self.command_line.process_title)
        # Create folder and set owner to gid 1000 => ojs-group
        tools.find_or_create_dir(self.ojs_dest_dir, change_owner=1000)

        tools.ensureDirsExist(self.ojs_metadata_dir,
                              self.pdf_input_dir,
                              self.ojs_dest_dir)

        # Temporary condition, in the future issn is always available
        if not issn_missing:
            for template, value in (("metadata_dir is %s", self.ojs_metadata_dir),
                                    ("pdf_input_dir is %s", self.pdf_input_dir),
                                    ("dest_dir is %s", self.ojs_dest_dir)):
                self.debug_message(template % value)