def step(self):
    '''
    Move pdf's and/or alto's from ocr to goobi.

    Returns None when every operation succeeds, otherwise the message of
    the exception that interrupted the step (as a string).
    '''
    try:
        self.getVariables()

        # Bring the black/white pdf from the OCR-server into the Goobi folder.
        self.moveFiles(self.ocr_pdf, self.goobi_pdf_bw)

        # The pdf arrives under the OCR-server's own name
        # (e.g. 1234567890_Antikva.pdf) and must end up under the required
        # name (e.g. 1234567890_bw.pdf). The target path is expected to have
        # been prepared by getVariables.
        pdf_name = tools.getFirstFileWithExtension(self.goobi_pdf_bw, 'pdf')
        current_location = os.path.join(self.goobi_pdf_bw, pdf_name)
        shutil.move(current_location, self.new_bw_pdf_path)
    except ValueError as err:
        # Presumably a failed string-to-int conversion of a config value
        # - check the config file.
        return str(err)
    except (TransferError, TransferTimedOut, IOError) as err:
        return str(err)
    except Exception as err:
        return str(err)
    return None
def getProcessTitle(path, config_file):
    '''
    Derive the process title for the process directory "path".

    The title is assumed to be the name of the toc file found in the
    configured toc folder, minus its last four characters (".toc").
    The toc folder is read from option 'metadata_toc_path' in section
    'process_folder_structure' of config_file.
    '''
    parser = configparser.RawConfigParser()
    parser.read(config_file)
    toc_subdir = parser.get('process_folder_structure', 'metadata_toc_path')
    toc_folder = os.path.join(path, toc_subdir)
    toc_file = tools.getFirstFileWithExtension(toc_folder, 'toc')
    # Drop the four-character extension
    return toc_file[:-4]
def tocExists(toc_dir):
    '''
    Look for a .toc file in the toc directory.

    Returns whatever tools.getFirstFileWithExtension yields for the
    directory, or False when that lookup raises an IOError.
    '''
    try:
        found = tools.getFirstFileWithExtension(toc_dir, '.toc')
    except IOError:
        found = False
    return found
def pageCountMatches(pdf_input_dir,input_files_dir,valid_exts):
    '''
    Check that the pdf has as many pages as there are input pictures.

    The page count is taken from the 'Pages' entry of pdfinfo for the
    first pdf in pdf_input_dir; the picture count is the number of files
    in input_files_dir with one of the extensions in valid_exts.
    Returns a boolean.
    '''
    pdf_name = tools.getFirstFileWithExtension(pdf_input_dir, '.pdf')
    pdf_path = os.path.join(pdf_input_dir, pdf_name)
    pages_in_pdf = int(tools.pdfinfo(pdf_path)['Pages'])
    pictures_in_dir = tools.getFileCountWithExtension(input_files_dir,
                                                      valid_exts)
    return pages_in_pdf == pictures_in_dir
def getVariables(self):
    '''
    Pull in the variables from the command line and the config file that
    this step needs in order to run: the path to the toc file, the Goobi
    metadata file, the url of the DBC data service, the pdf input folder
    and the article/language settings.

    Errors in variables will lead to an Exception being thrown.
    '''
    self.page_offset = None
    self.issnSet = False
    process_root = self.command_line.process_path

    # Toc file: first .toc file in the configured toc folder
    toc_subdir = self.getConfigItem('metadata_toc_path',
                                    section='process_folder_structure')
    toc_folder = os.path.join(process_root, toc_subdir)
    toc_file = tools.getFirstFileWithExtension(toc_folder, '.toc')
    self.toc_file_path = os.path.join(toc_folder, toc_file)

    # DBC data service
    self.service_url = self.getConfigItem('dbc_service', section='dbc')

    # Goobi metadata file, parsed to a dictionary tree for processing
    meta_name = self.getConfigItem('metadata_goobi_file',
                                   section='process_files')
    self.meta_file = os.path.join(process_root, meta_name)
    parsed = dict_tools.parseXmlToDict(self.meta_file)
    self.meta_data, _ = parsed

    # Folder holding the pdf used for pdf info
    pdf_subdir = self.getConfigItem('doc_limbpdf_path',
                                    section='process_folder_structure')
    self.pdf_input_dir = os.path.join(process_root, pdf_subdir)

    # Setting: whether articles may overlap (defaults to True)
    self.overlapping_articles = self.getSetting('overlapping_articles',
                                                'bool',
                                                default=True)
    # Setting: default language (defaults to 'da')
    self.default_language = self.getSetting('default_language',
                                            'string',
                                            default='da')
def getVariables(self):
    '''
    Collect the variables necessary to execute the script: file and
    folder paths, settings, required/optional issue fields, xml
    namespaces and the (initially empty) section lists.

    Tools will throw an Exception if something is missing.
    '''
    root = self.command_line.process_path

    # Goobi METS file
    mets_name = self.getConfigItem('metadata_goobi_file', None, 'process_files')
    self.mets_file = os.path.join(root, mets_name)

    self.ojs_root = self.getConfigItem('ojs_root')

    # Folder for the OJS metadata
    ojs_subdir = self.getConfigItem('metadata_ojs_path', None,
                                    'process_folder_structure')
    self.ojs_metadata_dir = os.path.join(root, ojs_subdir)

    # First pdf in the limb pdf folder
    # TODO: check files in 'doc_pdf_path' instead of 'doc_limbpdf_path'
    # 'doc_limbpdf_path' contains the split pdf-files
    pdf_subdir = self.getConfigItem('doc_limbpdf_path', None,
                                    'process_folder_structure')
    pdf_folder = os.path.join(root, pdf_subdir)
    self.pdf_name = tools.getFirstFileWithExtension(pdf_folder, '.pdf')
    self.pdf_file = os.path.join(pdf_folder, self.pdf_name)

    tools.ensureFilesExist(self.mets_file)
    tools.ensureDirsExist(self.ojs_metadata_dir)

    # Boolean setting, defaults to True.
    # NOTE(review): the type is passed as the builtin bool here - confirm
    # this is the form getSetting expects.
    self.overlapping_articles = self.getSetting('overlapping_articles',
                                                bool,
                                                default=True)
    # Path used to generate ojs_dir; 'system' means
    # "define it from system variables"
    self.ojs_journal_path = self.getSetting('ojs_journal_path',
                                            default='system')

    # Issue fields are stored as ';'-separated lists in the config
    required = self.getConfigItem('issue_required_fields')
    self.issue_required_fields = required.split(';')
    optional = self.getConfigItem('issue_optional_fields')
    self.issue_optional_fields = optional.split(';')

    # Namespaces
    self.mets_ns = 'http://www.loc.gov/METS/'
    self.goobi_ns = 'http://meta.goobi.org/v1.5.1/'

    # Sections, filled in later
    self.front_matter = []
    self.articles = []
    self.back_matter = []
def getPdf(self):
    '''
    Find the first .pdf file in self.pdf_input_dir and store its name
    (self.pdf_name), its full path (self.pdf_path) and the result of
    tools.pdfinfo for it (self.pdfinfo) on the instance.
    '''
    pdf_folder = self.pdf_input_dir
    self.pdf_name = tools.getFirstFileWithExtension(pdf_folder, ".pdf")
    self.pdf_path = os.path.join(pdf_folder, self.pdf_name)
    self.pdfinfo = tools.pdfinfo(self.pdf_path)