class AddArticlesToMetsFile( Step ):
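    '''
    Goobi step that parses the LIMB-produced .toc file for an issue,
    looks up each article in the DBC data service and adds the resulting
    article doc structs to the issue's METS file (meta.xml).
    '''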

    def setup(self):
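        # The display name below is Danish for 'Insert table of contents
        # into METS metadata'.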
        self.name = 'Indsæt indholdsfortegnelse i METS metadata'
        self.config_main_section = 'add_articles_to_mets_file'
        self.essential_config_sections = set( ['process_folder_structure',
                                               'process_files',
                                               'dbc'] )
        self.essential_commandlines = {
            'process_path' : 'folder'
        }

    def step(self):
        try:
            self.getVariables()
            self.getPdf()
            self.parseTocFile()
            self.isIssnSet()
            self.buildXml()
            self.writeXml()
        except Exception as e:
            # ValueError, IOError and any other failure are reported the same way
            return str(e) + traceback.format_exc()

    def getVariables(self):
        '''
        Pull in all the variables from the command line and the config
        file that this step needs: the path to the .toc file, the path
        to meta.xml and the URL of the DBC data service (eXist API).
        Errors in the variables will raise an exception.
        '''
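        # Illustrative config layout (section and key names are taken from
        # the getConfigItem calls below; the values are made-up examples):
        #
        #   [process_folder_structure]
        #   metadata_toc_path = metadata/toc/
        #   doc_limbpdf_path = documents/limbpdf/
        #   [process_files]
        #   metadata_goobi_file = meta.xml
        #   [dbc]
        #   dbc_service = http://example.org/dbc/marcx/{0}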
        self.page_offset = None
        self.issnSet = False
        
        process_path = self.command_line.process_path
        toc_dir = os.path.join(
            process_path, 
            self.getConfigItem('metadata_toc_path', section='process_folder_structure')
        )
        toc_name = tools.getFirstFileWithExtension(toc_dir, '.toc')
        self.toc_file_path = os.path.join(toc_dir, toc_name)
        
        
        self.service_url = self.getConfigItem('dbc_service', section='dbc')
        self.meta_file = os.path.join(
            self.command_line.process_path, 
            self.getConfigItem('metadata_goobi_file', section='process_files')
        )
        # Parse initial Goobi METS file to a dictionary tree for processing
        self.meta_data,_ = dict_tools.parseXmlToDict(self.meta_file)
        
        # For pdf info
        pdf_input = self.getConfigItem('doc_limbpdf_path',
                                       section= 'process_folder_structure')
        self.pdf_input_dir = os.path.join(process_path, pdf_input)
        
        # parse for overlapping articles
        self.overlapping_articles = self.getSetting('overlapping_articles',
                                                    'bool',default=True)
        # default language to use for the articles
        self.default_language = self.getSetting('default_language',
                                                'string',default='da')

    def getDBCData(self, article_id):
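        '''
        Fetch the DBC MARC XML record for the given article id by
        inserting the id into the service URL.
        '''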
        url = self.service_url.format(article_id)
        return MarcXml.initFromWeb(url)

    def getPdf(self):
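        '''
        Locate the first .pdf file in the LIMB pdf folder and read its
        pdfinfo metadata.
        '''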
        self.pdf_name = tools.getFirstFileWithExtension(self.pdf_input_dir, '.pdf')
        self.pdf_path = os.path.join(self.pdf_input_dir, self.pdf_name)
        self.pdfinfo = tools.pdfinfo(self.pdf_path)

    def parseTocFile(self):
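        '''
        Parse the .toc file into a TOC object and report known LIMB
        errors in the article list as error messages.
        '''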
        self.toc_data = TOC(self.toc_file_path,self.service_url,
                            self.pdfinfo, self.overlapping_articles,
                            self.glogger)
        #=======================================================================
        # Various checks of the toc-file
        #=======================================================================
        data_check = self.toc_data.erroneousPages()
        if data_check == 1:
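            # "Note: there is only one article for the issue. More can be
            # created in Goobi's metadata editor, or the issue can be sent
            # through LIMB again."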
            msg = ('NB!!! Der er kun en artikel for hæftet. Flere kan oprettes '
                   'via Goobis metadata-editor eller hæftet kan sendes '
                   'LIMB igen. Suk!')
            self.error_message(msg)
        elif data_check == 2:
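            # "Note: an error in LIMB has caused all of the issue's articles
            # to get the same start page. This must be corrected manually in
            # the METS editor for each individual article."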
            msg = ('NB!!! En fejl i LIMB har medført, at alle hæftets artikler '
                   'har samme startside. Dette skal rettes manuelt i '
                   'METS-editoren for hver enkelt artikel. Suk!')
            self.error_message(msg)

    def writeXml(self):
        '''
        Write the generated metadata tree back to the METS file (meta.xml).
        '''
        xml_tools.writeDictTreeToFile(self.meta_data,self.meta_file)

    def isIssnSet(self):
        '''
        Go through the metadata in the dmdSec of the METS file to see
        whether the ISSN field has been set, and set self.issnSet to
        True or False accordingly.
        '''
        self.issnSet = mets_tools.hasMetadataField(self.meta_data,'ISSN')
    
    def addIssnToPeriodical(self, issn):
        '''
        Add the ISSN field to the doc struct of type PeriodicalVolume,
        if it isn't already set.
        :param issn: the ISSN to add
        '''
        self.issnSet = mets_tools.addFieldToDocType(self.meta_data,
                                                    'PeriodicalVolume',
                                                    'ISSN',issn)

    def buildXml(self):
        '''
        Given a toc object consisting of articles with DBC ids, use the
        DBC service to generate data for each article. When all data is
        created, append it to the existing meta.xml data.
        '''
        self.createFrontMatterSection()
        self.createArticlesSection()
        self.createBackMatterSection()
        mets_tools.addOffsetToPhysicalStructMap(self.meta_data, self.toc_data.page_offset)
        self.meta_data = mets_tools.expandPagesFromChildrenToParent(self.meta_data)
        
    def createFrontMatterSection(self):
        articles = self.toc_data.getFrontMatterSection()
        if articles: self.createArticles(articles,'FrontMatter')
    
    def createArticlesSection(self):
        articles = self.toc_data.getArticlesSection()
        if articles: self.createArticles(articles,'Articles')
    
    def createBackMatterSection(self):
        articles = self.toc_data.getBackMatterSection()
        if articles: self.createArticles(articles,'BackMatter')
        
    def createArticles(self,section,section_type):
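        '''
        Add the articles of a toc section to the METS data under the doc
        struct named by section_type (e.g. 'FrontMatter', 'Articles' or
        'BackMatter'), creating that doc struct first if it is missing.
        '''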
        if not mets_tools.docTypeExists(self.meta_data, section_type):
            # Create the section, e.g. FrontMatter, if it doesn't exist
            section_data = {'doc_type': section_type,
                            'content': [{'name': 'TitleDocMain',
                                         'data': section_type}]}
            self.meta_data = mets_tools.addNewDocStruct(self.meta_data,section_data)
        articles = section.articles
        #TODO: Create articles-docstruct if not already there
        section_attrib = ('TYPE',section_type)
        for article in articles:
            article_data = self.createArticleData(article)
            if article_data and not self.articleExists(article_data):
                self.meta_data = mets_tools.addNewDocStruct(self.meta_data,
                                                            article_data,
                                                            section_attrib)
    
    def articleExists(self,article_data):
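        '''
        Return True and log a debug message if an article with the same
        title and page range already exists in the METS data, otherwise
        return False.
        '''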
        article_title = [c['data'] for c in article_data['content']
                         if c['name'] == 'TitleDocMain'][0]
        start_page = article_data['start_page']
        end_page = article_data['end_page']
        if mets_tools.articleExists(self.meta_data, article_title,
                                    start_page,end_page):
            err = ('Article "{0}" already exists in the METS file. Possible '
                   'duplicate. Article is skipped.')
            err = err.format(article_title.encode('utf-8'))
            self.debug_message(err)
            return True
        else: return False
    
    def createArticleData(self, article):
        '''
        Create a metadata structure that can be 
        consumed by the Meta XML builder class.
        This takes the form of a list of dictionaries,
        with each dictionary representing a field or set of fields.
        For example: [{'name': 'Abstract', 'data' : 'From the Roman Empire...' }, 
            {'name' : 'TitleDocMain', 'data' : 'Return of the oppressed'},
            {'name' : 'Author', 'type' : 'person', 'fields' : [
                {'tag' : 'goobi:firstName', 'data' : 'Peter'},
                {'tag' : 'goobi:lastName', 'data' : 'Turchin'}
            ]}
        ]
        See the MetaXml class for more details.
        '''
        content = list()
        #=======================================================================
        # Set language
        #=======================================================================
        content.append({'name': 'DocLanguage', 'data':article.language})
        #=======================================================================
        # Add DBC article id
        #=======================================================================
        if article.article_id:
            content.append({'name': 'dbcMarcxID', 'data':article.article_id})
        #=======================================================================
        # Add title, update time and sub title
        #=======================================================================
        content.append({'name': 'TitleDocMain', 'data':article.title})
        if article.update_time:
            content.append({'name': 'UpdateTime', 'data':article.update_time})
        
        if article.sub_title:
            content.append({'name': 'TitleDocSub1', 'data':article.sub_title})
        #=======================================================================
        # Add subjects
        #=======================================================================
        for subject in article.subjects:
            content.append({'name': 'Subject', 'data':subject})
        #=======================================================================
        # Add description and content description
        #=======================================================================
        if article.description:
            content.append({'name': 'Description', 'data':article.description})
        if article.content_description:
            content.append({'name': 'ContentDescription', 'data':article.content_description})
            
        #=======================================================================
        # Add start and end page
        #=======================================================================
        start_page = article.start_page
        end_page = article.end_page
        #=======================================================================
        # Add authors
        #=======================================================================
        if article.authors: # multiple authors or author from dbc-data
            for author in article.authors:
                given_name = author[0]
                family_name  = author[1]
                author_element = self.__createAuthorElement(given_name, family_name)
                if author_element: content.append(author_element)
        elif article.author:
            if len(article.author.split(' ')) > 1: # multiple names
                given_name, family_name = article.author.split(' ',1) 
            else:
                given_name = article.author 
                family_name = ''
            author_element = self.__createAuthorElement(given_name, family_name)
            if author_element: content.append(author_element)
        else:
            given_name = ''
            family_name = ''
            author_element = self.__createAuthorElement(given_name, family_name)
            if author_element: content.append(author_element)
            # TODO: Do check on the numbers of author names. If very long
            # raise a note to quality control. This is a larger implementation.
            # An example of a long author field:
            #    "Else Marie Pedersen i samarbejde med I�rn Pi� og Holger Rasmussen"
        # create elements for any other authors
        # TODO: routine to split up author field - e.g. use ';' to separate
        # authors.
        #=======================================================================
        # Add the ISSN if it isn't set. The ISSN lives in the metadata for the
        # issue but comes from the articles in DBC's metadata. Thus we give the
        # issue the ISSN from the article, if it isn't previously set.
        #=======================================================================
        if not self.issnSet and article.issn:
            self.addIssnToPeriodical(article.issn)
        
        #=======================================================================
        # Join article element to be added to mets-file 
        #=======================================================================
        article_data = {'doc_type': 'Article',
                        'content': content,
                        'start_page': start_page,
                        'end_page': end_page}
        return article_data

    def __createAuthorElement(self, firstname, lastname):
        '''
        Given a firstname and a lastname, create a dictionary
        representing a single author whose 'fields' entry is a list of
        dictionaries in the following form:
        [{'tag' : 'firstName', 'data' : 'Peter'},
        {'tag' : 'lastName', 'data' : 'Turchin'}]
        If both firstname and lastname are empty, None is returned.
        '''
        author = dict()
        author['name'] = 'Author'
        author['type'] = 'person'
        author_fields = list()
        if firstname:
            firstname_elem = dict()
            firstname_elem['tag'] = 'firstName'
            firstname_elem['data'] = firstname
            author_fields.append(firstname_elem)
        if lastname:
            lastname_elem = dict()
            lastname_elem['tag'] = 'lastName'
            lastname_elem['data'] = lastname
            author_fields.append(lastname_elem)
        # build the best display name we can, given the 
        # data available to us
        display_name = dict()
        display_name['tag'] = 'displayName'
        if firstname and lastname:
            display_name['data'] = u"{0}, {1}".format(lastname, firstname)
            author_fields.append(display_name)
        elif lastname:
            display_name['data'] = lastname
            author_fields.append(display_name)
        elif firstname:
            display_name['data'] = firstname
            author_fields.append(display_name)

        if author_fields:
            author['fields'] = author_fields
            return author
        else:
            return None