コード例 #1
0
    def createMetsFile(self):
        """
        Make sure the META.XML file references the process images.

        The file at self.meta_file is parsed into a dictionary tree. If
        mets_tools.containsImages reports that the tree already holds
        image references, nothing is changed. Otherwise the references
        are added with mets_tools.addImages, using self.img_src as image
        source, and the resulting tree is written back to self.meta_file.
        """
        tree, _namespaces = dict_tools.parseXmlToDict(self.meta_file)

        # Leave the file alone when it already has FILE_nnnn and PHYS_nnnn
        # entries.
        # TODO(legr): what if a step is pushed back because of missing
        # images? When they are added later this is not updated - we
        # probably need a way to clean the mets file.
        if mets_tools.containsImages(tree):
            return

        # META.XML had no image entries yet: create the FILE_nnnn and
        # PHYS_nnnn references to the actual files (presumably the ones in
        # /master_orig/ - confirm against how self.img_src is set).
        tree = mets_tools.addImages(tree, self.img_src)

        # Persist the new references. NOTE(legr): the overall META.XML
        # structure also seems to get somewhat reformatted by this write -
        # METS standard?
        xml_tools.writeDictTreeToFile(tree, self.meta_file)
 def getVariables(self):
     """
     Read the values this step needs from the command line and the
     config file and store them on the instance.

     Attributes set:
       page_offset          -- initialised to None
       issnSet              -- initialised to False
       toc_file_path        -- toc folder joined with the name returned by
                               tools.getFirstFileWithExtension for '.toc'
       service_url          -- 'dbc_service' item of the 'dbc' section
                               (link to the DBC data service, eXist API)
       meta_file            -- process path joined with the
                               'metadata_goobi_file' config item
       meta_data            -- meta_file parsed to a dictionary tree
       pdf_input_dir        -- process path joined with the
                               'doc_limbpdf_path' config item
       overlapping_articles -- 'bool' setting, default True
       default_language     -- 'string' setting, default 'da'

     Nothing is caught here, so an error raised by any of the lookups
     or by the parsing propagates to the caller as an exception.
     """
     self.page_offset = None
     self.issnSet = False

     process_path = self.command_line.process_path

     # Path to the toc file inside the configured toc folder
     toc_folder = self.getConfigItem('metadata_toc_path',
                                     section='process_folder_structure')
     toc_dir = os.path.join(process_path, toc_folder)
     self.toc_file_path = os.path.join(
         toc_dir,
         tools.getFirstFileWithExtension(toc_dir, '.toc')
     )

     # Link to the DBC data service
     self.service_url = self.getConfigItem('dbc_service', section='dbc')

     # Initial Goobi METS file, kept both as a path and as a parsed
     # dictionary tree for later processing
     meta_name = self.getConfigItem('metadata_goobi_file',
                                    section='process_files')
     self.meta_file = os.path.join(process_path, meta_name)
     self.meta_data, _namespaces = dict_tools.parseXmlToDict(self.meta_file)

     # Folder used for pdf info
     pdf_folder = self.getConfigItem('doc_limbpdf_path',
                                     section='process_folder_structure')
     self.pdf_input_dir = os.path.join(process_path, pdf_folder)

     # Boolean setting controlling the handling of overlapping articles
     self.overlapping_articles = self.getSetting(
         'overlapping_articles', 'bool', default=True)

     # Language setting, 'da' unless configured otherwise
     self.default_language = self.getSetting(
         'default_language', 'string', default='da')
コード例 #3
0
    data = dict()
    for elem in metadata:
        name = elem.getAttribute('name')
        if name in required_fields:
            data[name] = elem.firstChild.nodeValue
    for item in required_fields:
        if item not in data: 
            raise DataError("{0} missing value {1}".format(anchor_file, item))
    return data

if __name__ == '__main__':
    image_src = '/opt/digiverso/goobi/metadata/201/images/master_orig'
    src = '/opt/digiverso/goobi/metadata/201/meta.xml'
    #src = '/opt/digiverso/goobi/metadata/194/meta - complete data.xml'
    dest = os.path.join(os.path.dirname(src),'meta_new.xml')
    dict_tree,ns  = dict_tools.parseXmlToDict(src)
    
    # Add images - i.e. create phys struct map, file sec and set pathimages
    if not mets_tools.containsImages(dict_tree):
        dict_tree = mets_tools.addImages(dict_tree,image_src) 
    
    content = [{'name': 'Abstract', 'data' : 'From the Roman Empire...' }, 
               {'name' : 'TitleDocMain', 'data' : 'Return of the oppressed'},
               {'name' : 'Author', 'type' : 'person', 'fields' :
                    [{'tag' : 'displayName', 'data' : 'Turchin, Peter'},
                     {'tag' : 'firstName', 'data' : 'Peter'},
                     {'tag' : 'lastName', 'data' : 'Turchin'}]
                }]
    new_doc_struct = {'content':content,
                      'doc_type':'Article',
                      'start_page':5,