def step(self):
    '''
    Transfer the black-and-white pdf from the OCR server to Goobi and
    rename it to the name required there
    (e.g. from 1234567890_Antikva.pdf to 1234567890_bw.pdf).

    Returns None when everything went through, otherwise the message
    of the exception that interrupted the work.
    '''
    try:
        # Resolve the attributes used below (paths etc.)
        self.getVariables()
        # Bring the bw pdf over from the OCR server to the Goobi folder
        self.moveFiles(self.ocr_pdf, self.goobi_pdf_bw)
        # Locate the pdf that now sits in the Goobi folder ...
        moved_name = tools.getFirstFileWithExtension(self.goobi_pdf_bw, 'pdf')
        moved_path = os.path.join(self.goobi_pdf_bw, moved_name)
        # ... and rename it to the target path
        # (self.new_bw_pdf_path is expected to be prepared by getVariables)
        shutil.move(moved_path, self.new_bw_pdf_path)
    except ValueError as exc:
        # e.g. a value from the config file could not be converted
        return str(exc)
    except (TransferError, TransferTimedOut, IOError) as exc:
        return str(exc)
    except Exception as exc:
        # Anything else is reported to the caller the same way
        return str(exc)
    return None
def getProcessTitle(path, config_file):
    '''
    Derive the process title for the process directory `path`.

    The title is assumed to be the name of the toc file without its
    last four characters (the ".toc" ending). The folder holding the
    toc file is read from `config_file`, option 'metadata_toc_path'
    in section 'process_folder_structure', relative to `path`.
    '''
    parser = configparser.RawConfigParser()
    parser.read(config_file)
    toc_subdir = parser.get('process_folder_structure', 'metadata_toc_path')
    toc_file = tools.getFirstFileWithExtension(os.path.join(path, toc_subdir), 'toc')
    suffix_length = 4  # number of characters in ".toc"
    return toc_file[:-suffix_length]
def tocExists(toc_dir):
    '''
    Look for a .toc file in the toc directory.

    Returns whatever tools.getFirstFileWithExtension yields for the
    directory (the filename, going by the original notes), or False
    when that lookup raises an IOError.
    '''
    try:
        return tools.getFirstFileWithExtension(toc_dir, '.toc')
    except IOError:
        return False
def pageCountMatches(pdf_input_dir,input_files_dir,valid_exts):
    '''
    Check that the pdf has as many pages as there are input files.

    The page count is taken from the 'Pages' entry of tools.pdfinfo
    for the first pdf found in `pdf_input_dir`; the file count comes
    from tools.getFileCountWithExtension for `input_files_dir` and
    `valid_exts`.
    Returns True when both numbers are equal, False otherwise.
    '''
    pdf_name = tools.getFirstFileWithExtension(pdf_input_dir, '.pdf')
    pdf_path = os.path.join(pdf_input_dir, pdf_name)
    page_total = int(tools.pdfinfo(pdf_path)['Pages'])
    input_total = tools.getFileCountWithExtension(input_files_dir, valid_exts)
    return page_total == input_total
 def getVariables(self):
     '''
     Collect the values this step needs from the command line and
     the config file and store them on the instance.

     Sets: page_offset, issnSet, toc_file_path, service_url,
     meta_file, meta_data, pdf_input_dir, overlapping_articles and
     default_language.
     According to the original notes, faulty variables lead to an
     exception being thrown by the helpers called here.
     '''
     self.page_offset = None
     self.issnSet = False

     process_path = self.command_line.process_path

     # Path to the toc file inside the process folder
     toc_subdir = self.getConfigItem('metadata_toc_path',
                                     section='process_folder_structure')
     toc_dir = os.path.join(process_path, toc_subdir)
     self.toc_file_path = os.path.join(
         toc_dir, tools.getFirstFileWithExtension(toc_dir, '.toc'))

     # Link to the DBC data service
     self.service_url = self.getConfigItem('dbc_service', section='dbc')

     # Goobi meta file, parsed to a dictionary tree for processing;
     # the second value returned by the parser is not used here
     meta_name = self.getConfigItem('metadata_goobi_file', section='process_files')
     self.meta_file = os.path.join(process_path, meta_name)
     self.meta_data, _ignored = dict_tools.parseXmlToDict(self.meta_file)

     # Directory with the pdf, used for pdf info
     pdf_subdir = self.getConfigItem('doc_limbpdf_path',
                                     section='process_folder_structure')
     self.pdf_input_dir = os.path.join(process_path, pdf_subdir)

     # Boolean setting for overlapping articles (defaults to True)
     self.overlapping_articles = self.getSetting(
         'overlapping_articles', 'bool', default=True)
     # String setting for the default language (defaults to 'da')
     self.default_language = self.getSetting(
         'default_language', 'string', default='da')
    def getVariables(self):
        '''
        Gather the variables needed to execute the script and store
        them on the instance.

        Reads paths and field lists from the config file and settings,
        checks that the mets file and the ojs metadata directory exist
        and initialises the namespaces and the (empty) section lists.
        According to the original notes, tools throws an exception
        when something required is missing.
        '''
        process_path = self.command_line.process_path

        # Goobi mets file
        self.mets_file = os.path.join(
            process_path,
            self.getConfigItem('metadata_goobi_file', None, 'process_files'),
        )

        # OJS root and the metadata folder inside the process folder
        self.ojs_root = self.getConfigItem('ojs_root')
        self.ojs_metadata_dir = os.path.join(
            process_path,
            self.getConfigItem('metadata_ojs_path', None, 'process_folder_structure'),
        )

        # First pdf found in the limb pdf folder
        # TODO: check files in 'doc_pdf_path' instead of 'doc_limbpdf_path'
        # ('doc_limbpdf_path' contains the split pdf files)
        pdf_dir = os.path.join(
            process_path,
            self.getConfigItem('doc_limbpdf_path', None, 'process_folder_structure'),
        )
        self.pdf_name = tools.getFirstFileWithExtension(pdf_dir, '.pdf')
        self.pdf_file = os.path.join(pdf_dir, self.pdf_name)

        # Make sure the inputs are in place before going on
        tools.ensureFilesExist(self.mets_file)
        tools.ensureDirsExist(self.ojs_metadata_dir)

        # Boolean setting for overlapping articles (defaults to True)
        self.overlapping_articles = self.getSetting(
            'overlapping_articles', bool, default=True)

        # Path used to generate the ojs dir;
        # 'system' means "define it from system variables"
        self.ojs_journal_path = self.getSetting('ojs_journal_path', default='system')

        # Required and optional issue fields, ';'-separated in the config
        self.issue_required_fields = self.getConfigItem('issue_required_fields').split(';')
        self.issue_optional_fields = self.getConfigItem('issue_optional_fields').split(';')

        # XML namespaces
        self.mets_ns = 'http://www.loc.gov/METS/'
        self.goobi_ns = 'http://meta.goobi.org/v1.5.1/'

        # Sections, filled later
        self.front_matter = []
        self.articles = []
        self.back_matter = []
Example #7
0
 def getPdf(self):
     '''
     Find the first pdf in self.pdf_input_dir and store its name
     (pdf_name), its full path (pdf_path) and the result of
     tools.pdfinfo for it (pdfinfo) on the instance.
     '''
     pdf_dir = self.pdf_input_dir
     self.pdf_name = tools.getFirstFileWithExtension(pdf_dir, ".pdf")
     self.pdf_path = os.path.join(pdf_dir, self.pdf_name)
     self.pdfinfo = tools.pdfinfo(self.pdf_path)