コード例 #1
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
 def loadTable( self, element, location ):
     """
     Convert an XML table element into table lines in self.thisBook.

     Each 'tr' subelement starts a new (empty) 'tr' line. Each cell inside it
     (th, thr, tc or tcr, optionally with a 'level' attribute) is appended to
     that line as a space, a backslash marker (tag plus level), a space and
     the cleaned cell text.

     element is the XML table element.
     location is a description of where the element was found (only used
         in the messages passed to the checks and the logger).
     """
     BibleOrgSysGlobals.checkXMLNoText( element, location, 'kg92' )
     BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ka92' )
     BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'ks63' )
     for subelement in element:
         sublocation = subelement.tag + " of " + location
         if subelement.tag == 'tr':
             self.thisBook.addLine( 'tr', '' )
             BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'sg32' )
             BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'dh82' )
             BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'mniq' )
             for sub2element in subelement:
                 sub2location = sub2element.tag + " of " + sublocation
                 tag, text = sub2element.tag, clean(sub2element.text)
                 assert( tag in ('th', 'thr', 'tc', 'tcr',) )
                 BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ah82' )
                 BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' )
                 level = None
                 for attrib,value in sub2element.items():
                     if attrib == 'level': level = value
                     else:
                         # Report the cell's own location (not the whole table's) so the attribute can be found
                         logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                 marker = tag + (level if level else '')
                 # An empty cell must not put the literal text 'None' into the line
                 self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, '' if text is None else text ) )
         else:
             # NOTE(review): C and V are not defined in this method -- presumably module-level values; TODO confirm
             logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.BBB, C, V, sublocation ) )
コード例 #2
0
    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (used in messages and passed on to the verse handler).
        thisBook is the book object that receives the extracted lines via addLine().
        chapter is the XML chapter element; its number is read from the
            'cnumber' attribute.

        Caption elements (used in Psalms) are saved as verse zero.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter..."))

        # Process the chapter attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "cnumber":
                chapterNumber = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            thisBook.addLine('c', chapterNumber)
        else:
            # Name the attribute that is actually read above, and really interpolate the book code
            logging.error(
                "Missing 'cnumber' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag == ZefaniaXMLBible.verseTag:
                self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                               element)
            elif element.tag == ZefaniaXMLBible.captionTag:  # Used in Psalms
                location = "caption in {} {}".format(BBB, chapterNumber)
                BibleOrgSysGlobals.checkXMLNoTail(element, location, 'k5k8')
                BibleOrgSysGlobals.checkXMLNoSubelements(
                    element, location, 'd3f5')
                # Handle caption attributes
                vRef = None
                for attrib, value in element.items():
                    if attrib == "vref":
                        vRef = value
                        if BibleOrgSysGlobals.debugFlag: assert (vRef == '1')
                    else:
                        logging.warning(
                            "Unprocessed {!r} attribute ({}) in caption element"
                            .format(attrib, value))
                if BibleOrgSysGlobals.debugFlag: assert (vRef)
                vText = element.text
                if vText:  # This is the main text of the caption
                    thisBook.addLine('v', '0' + ' ' +
                                     vText)  # We save it as verse zero
                else:
                    logging.warning("{} {}:{} has no text".format(
                        BBB, chapterNumber, vRef))
            else:
                logging.error("Expected to find {!r} but got {!r}".format(
                    ZefaniaXMLBible.verseTag, element.tag))
コード例 #3
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
 def loadCrossreference( self, element, location ):
     """
     Append an XML cross-reference element to the last line of self.thisBook.

     Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x>

     The appended text starts with the x marker and the caller, continues with
     one field per subelement (ref, xo or xt) and finishes with the closing
     x marker followed by the element's cleaned tail (if any).

     element is the XML cross-reference element.
     location is a description of where the element was found (only used
         in the messages passed to the checks and the logger).
     """
     # text is only referenced by the commented-out debug print below
     text, tail = clean(element.text), clean(element.tail)
     caller = None
     for attrib,value in element.items():
         if attrib == 'caller':
             caller = value
         else:
             logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
     # NOTE(review): if there was no caller attribute then the text 'None' gets written here -- confirm that can't happen
     self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) )
     for subelement in element:
         sublocation = subelement.tag + " of " + location
         marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
         #print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) )
         #if BibleOrgSysGlobals.verbosityLevel > 0 and marker not in ('ref','xo','xt',):
             #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) )
         if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','xo','xt',) )
         if marker=='ref':
             # A direct ref: written as the marker, the target, the closing marker and then the display text
             assert( xText )
             BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 's1sd' )
             target = None
             for attrib,value in subelement.items():
                 if attrib == 'tgt': target = value
                 else:
                     logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
             if target:
                 self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) )
             # halt is not defined in this block
             # NOTE(review): presumably an intentionally undefined name so that unexpected input stops with a NameError -- confirm
             else: halt
         else:
             # Any other subelement: written as the marker and its text, then any nested refs
             BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'sc35' )
             self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) )
             if marker[0] == 'x': # Starts with x, e.g., xo, xt
                 for sub2element in subelement:
                     sub2location = sub2element.tag + " of " + sublocation
                     marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                     BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' )
                     if marker2=='ref':
                         # For a nested ref the display text is written first, followed by the marker and the target
                         if xText2:
                             #print( 'xt2', marker2, repr(xText2), repr(xTail2), sub2location )
                             self.thisBook.appendToLastLine( xText2 )
                         target = None
                         for attrib,value in sub2element.items():
                             if attrib == 'tgt': target = value
                             else:
                                 logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                         if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                         else: halt # See the NOTE(review) on halt above
                     else: halt # Only ref is handled inside xo/xt
                     if xTail2: self.thisBook.appendToLastLine( xTail2 )
             else: halt # A marker that is neither ref nor starts with x
         if xTail:
             # Text following the subelement is written after a closing marker for that subelement
             self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, xTail ) )
     # Close the cross-reference, putting a space before any text that followed the whole element
     self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
コード例 #4
0
    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (used in messages).
        thisBook is the book object that receives the extracted lines via addLine().
        chapter is the XML chapter element; its number is read from the 'n' attribute.

        Verse text containing newlines is saved as a series of 'q1' lines
            (with the verse number on the first one).
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter...") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib=="n":
                chapterNumber = value
            elif attrib=="VERSES":
                pass # Recognised (so no warning is logged) but the value isn't used
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            chapterNumber = chapterNumber.replace( 'of Solomon ', '' ) # Fix a mistake in the Chinese_SU module
            thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) ) # Now really includes the book code

        for element in chapter:
            if element.tag == OpenSongXMLBible.verseTag:
                sublocation = "verse in {} {}".format( BBB, chapterNumber )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'l5ks' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5f7h' )
                verseNumber = None
                for attrib,value in element.items():
                    if attrib=="n":
                        verseNumber = value
                    elif attrib=="t":
                        # NOTE(review): recognised (so no warning is logged) but never used
                        #   -- presumably the end of a verse range; TODO confirm whether it should be saved
                        pass
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert( verseNumber )
                vText = element.text
                if not vText:
                    logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) )
                elif '\n' in vText: # This is how they represent poetry
                    for j, textBit in enumerate( vText.split( '\n' ) ):
                        if j==0:
                            thisBook.addLine( 'q1', '' )
                            thisBook.addLine( 'v', verseNumber + ' ' + textBit )
                        else: thisBook.addLine( 'q1', textBit )
                else: # Just one verse line
                    thisBook.addLine( 'v', verseNumber + ' ' + vText )
            else: logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.verseTag, element.tag ) )
コード例 #5
0
ファイル: HaggaiXMLBible.py プロジェクト: gimapei/BibleOrgSys
    def __validateAndExtractParagraph(self, BBB, chapterNumber, thisBook,
                                      paragraph):
        """
        Validate a paragraph element and save its contents into thisBook.

        A 'p' line is added for the paragraph itself, then each verse
        subelement is handed on to the verse handler. Anything else is
        logged as an error (the caption handling can't currently match
        because 'disabled' is appended to the tag that it looks for).
        """

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML paragraph…"))

        paragraphLocation = "paragraph in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoAttributes(paragraph, paragraphLocation,
                                                'brgw3')
        BibleOrgSysGlobals.checkXMLNoText(paragraph, paragraphLocation,
                                          'brgw3')
        BibleOrgSysGlobals.checkXMLNoTail(paragraph, paragraphLocation,
                                          'brgw3')
        thisBook.addLine('p', '')

        for child in paragraph:
            childTag = child.tag

            # The normal case: a verse inside the paragraph
            if childTag == HaggaiXMLBible.verseTag:
                self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                               child)
                continue

            # Anything that isn't a (disabled) caption is unexpected
            if childTag != HaggaiXMLBible.captionTag + 'disabled':
                logging.error("Expected to find {!r} but got {!r}".format(
                    HaggaiXMLBible.verseTag, childTag))
                continue

            # Caption (used in Psalms)
            captionLocation = "caption in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(child, captionLocation, 'k5k8')
            BibleOrgSysGlobals.checkXMLNoSubelements(child, captionLocation,
                                                     'd3f5')
            vRef = None
            for attrib, value in child.items():
                if attrib != "vref":
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in caption element"
                        .format(attrib, value))
                    continue
                vRef = value
                if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
            if BibleOrgSysGlobals.debugFlag: assert vRef

            captionText = child.text
            if captionText:
                # The caption text is saved as verse zero
                thisBook.addLine('v', '0 ' + captionText)
            else:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, vRef))
コード例 #6
0
    def __validateAndExtractBook( self, book ):
        """
        Validate one XML book element and stash its contents as a BibleBook.

        The book code is looked up from the 'bnumber' attribute or, if no
            number was given, deduced from the 'bname' attribute.
        If no book code results, nothing is stashed.
        Otherwise each caption is saved as an 'mt' line and each chapter
            is handed on to the chapter handler.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Collect the book attributes that we understand (warning about any others)
        known = { "bnumber":None, "bname":None, "bsname":None }
        for attrib,value in book.items():
            if attrib in known: known[attrib] = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )
        bookNumber, bookName, bookShortName = known["bnumber"], known["bname"], known["bsname"]

        # Work out which book this is
        BBB = None
        if bookNumber:
            try: BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
            except KeyError:
                message = "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                logging.warning( message.format( bookNumber, bookName, bookShortName ) )
        elif bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )
        if not BBB: return # Nothing that we can save

        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) )
        thisBook = BibleBook( self, BBB )
        thisBook.objectNameString = 'Haggai XML Bible Book object'
        thisBook.objectTypeString = 'Haggai'
        for child in book:
            childTag = child.tag
            if childTag == HaggaiXMLBible.captionTag:
                captionLocation = "caption in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoAttributes( child, captionLocation, 'jhl6' )
                BibleOrgSysGlobals.checkXMLNoSubelements( child, captionLocation, 'jk21' )
                BibleOrgSysGlobals.checkXMLNoTail( child, captionLocation, 'kjh6' )
                thisBook.addLine( 'mt', child.text )
            elif childTag == HaggaiXMLBible.chapterTag:
                chapterLocation = "chapter in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoText( child, chapterLocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoTail( child, chapterLocation, 'al1d' )
                self.__validateAndExtractChapter( BBB, thisBook, child )
            else:
                logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, childTag ) )
        if BibleOrgSysGlobals.verbosityLevel > 2: print( "  Saving {} into results…".format( BBB ) )
        self.stashBook( thisBook )
コード例 #7
0
    def __validateAndExtractBook( self, book ):
        """
        Check the given XML book element, build a BibleBook from it and stash it.

        The 'bnumber', 'bname' and 'bsname' attributes are read; the book
            code comes from the number if there is one, else from the name.
        Without a book code the method returns without stashing anything.
        Captions become 'mt' lines and chapters go to the chapter handler.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Process the book attributes first
        bookNumber = bookName = bookShortName = None
        for attrib,value in book.items():
            if attrib=="bnumber": bookNumber = value
            elif attrib=="bname": bookName = value
            elif attrib=="bsname": bookShortName = value
            else:
                logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )

        # Deduce the book code (preferring the number over the name)
        BBB = None
        if bookNumber:
            try:
                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
            except KeyError:
                logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                                        .format( bookNumber, bookName, bookShortName ) )
        elif bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )

        if not BBB:
            return

        beVerbose = BibleOrgSysGlobals.verbosityLevel > 2
        if beVerbose: print( _("Validating {} {}…").format( BBB, bookName ) )
        thisBook = BibleBook( self, BBB )
        thisBook.objectNameString = 'Haggai XML Bible Book object'
        thisBook.objectTypeString = 'Haggai'

        for subelement in book:
            if subelement.tag == HaggaiXMLBible.captionTag:
                where = "caption in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, where, 'jhl6' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, where, 'jk21' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, where, 'kjh6' )
                thisBook.addLine( 'mt', subelement.text )
                continue
            if subelement.tag == HaggaiXMLBible.chapterTag:
                where = "chapter in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoText( subelement, where, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, where, 'al1d' )
                self.__validateAndExtractChapter( BBB, thisBook, subelement )
                continue
            logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, subelement.tag ) )

        if BibleOrgSysGlobals.verbosityLevel > 2: print( "  Saving {} into results…".format( BBB ) )
        self.stashBook( thisBook )
コード例 #8
0
    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving paragraph elements.

        BBB is the book code (used in messages and passed on to the other handlers).
        thisBook is the book object that receives the extracted lines via addLine().
        chapter is the XML chapter element; its number is read from the
            'cnumber' attribute.

        Only paragraph subelements can currently match: the verse and caption
            branches compare against a tag with 'disabled' appended.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib=="cnumber":
                chapterNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'cnumber' attribute in chapter element for {}".format( BBB ) ) # Name the attribute that's actually read

        for element in chapter:
            if element.tag == HaggaiXMLBible.paragraphTag:
                self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.verseTag+'disabled':
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
                location = "caption in {} {}".format( BBB, chapterNumber )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'k5k8' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'd3f5' )
                # Handle caption attributes
                vRef = None
                for attrib,value in element.items():
                    if attrib=="vref":
                        vRef = value
                        if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert vRef
                vText = element.text
                if vText: # This is the main text of the caption
                    thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero
                else:
                    logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
            # Paragraphs are the only thing accepted above, so that's what the message must say was expected
            #   (otherwise a stray verse gives "Expected to find <verse tag> but got <verse tag>")
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.paragraphTag, element.tag ) )
コード例 #9
0
    def _validate( self ):
        """
        Check/validate the loaded data.

        Goes through each element of self._XMLtree and logs (without raising):
            an error for each missing compulsory attribute,
            a warning for each blank compulsory (other than "type") or optional attribute,
            a warning for each attribute that is neither compulsory nor optional,
            an error for each repeated value in an attribute that must be unique
                (other than "reference_name"), and
            a warning for each element that isn't the main element tag.

        Raises AssertionError if self._XMLtree is false.
        """
        assert self._XMLtree

        # The values seen so far for each attribute that must be unique (in that particular field)
        seenValues = { attributeName: set() for attributeName in self._uniqueAttributes }

        for j,element in enumerate(self._XMLtree):
            if element.tag != self._mainElementTag:
                logging.warning( "Unexpected element: {} in record {}".format( element.tag, j ) )
                continue

            BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
            BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

            # Check compulsory attributes on this main element
            for attributeName in self._compulsoryAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is None:
                    logging.error( "Compulsory {!r} attribute is missing from {} element in record {}".format( attributeName, element.tag, j ) )
                # elif so that a missing attribute isn't also reported as being blank
                elif not attributeValue and attributeName!="type":
                    logging.warning( "Compulsory {!r} attribute is blank on {} element in record {}".format( attributeName, element.tag, j ) )

            # Check optional attributes on this main element
            for attributeName in self._optionalAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None and not attributeValue:
                    logging.warning( "Optional {!r} attribute is blank on {} element in record {}".format( attributeName, element.tag, j ) )

            # Check for unexpected additional attributes on this main element
            for attributeName,attributeValue in element.items():
                if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                    logging.warning( "Additional {!r} attribute ({!r}) found on {} element in record {}".format( attributeName, attributeValue, element.tag, j ) )

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self._uniqueAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None and attributeName!="reference_name":
                    if attributeValue in seenValues[attributeName]:
                        logging.error( "Found {!r} data repeated in {!r} field on {} element in record {}".format( attributeValue, attributeName, element.tag, j ) )
                    seenValues[attributeName].add( attributeValue )
コード例 #10
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
 def loadFigure( self, element, location ):
     """
     Append an XML figure element to the last line of self.thisBook.

     The cleaned text of each subelement is stored under its tag name
     (which must be one of the seven known figure fields). The fields are
     then written out in a fixed order, separated by vertical bars, between
     opening and closing fig markers, followed by the element's cleaned
     tail (if any).

     element is the XML figure element.
     location is a description of where the element was found (only used
         in the messages passed to the checks).
     """
     BibleOrgSysGlobals.checkXMLNoText( element, location, 'ff36' )
     BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'cf35' )

     # The order here is the order in which the fields get written out
     fieldOrder = ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',)
     figFields = dict.fromkeys( fieldOrder, '' ) # Fields that aren't supplied stay empty

     for child in element:
         childLocation = child.tag + " of " + location
         fieldName, fieldText = child.tag, clean(child.text)
         assert( fieldName in figFields )
         figFields[fieldName] = fieldText if fieldText is not None else ''
         BibleOrgSysGlobals.checkXMLNoTail( child, childLocation, 'jkf5' )
         BibleOrgSysGlobals.checkXMLNoAttributes( child, childLocation, 'ld18' )
         BibleOrgSysGlobals.checkXMLNoSubelements( child, childLocation, 'hb46' )

     joinedFields = '|'.join( figFields[fieldName] for fieldName in fieldOrder )
     figTail = clean( element.tail )
     trailer = (' '+figTail) if figTail else ''
     self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( joinedFields, trailer ) )
コード例 #11
0
ファイル: HaggaiXMLBible.py プロジェクト: alerque/BibleOrgSys
    def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ):
        """
        Check a paragraph element and save what it contains into thisBook.

        Adds a 'p' line for the paragraph, then passes each verse subelement
            on to the verse handler.
        Any other subelement is logged as an error (the caption handling
            can't currently match because 'disabled' is appended to the
            tag that it's compared with).
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML paragraph...") )

        paragraphLocation = "paragraph in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoAttributes( paragraph, paragraphLocation, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoText( paragraph, paragraphLocation, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoTail( paragraph, paragraphLocation, 'brgw3' )
        thisBook.addLine( 'p', '' )

        for subelement in paragraph:
            subTag = subelement.tag

            if subTag == HaggaiXMLBible.verseTag: # The normal case
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, subelement )
                continue

            if subTag != HaggaiXMLBible.captionTag+'disabled': # Not something that we know about
                logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, subTag ) )
                continue

            # Must be a caption (used in Psalms)
            captionLocation = "caption in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( subelement, captionLocation, 'k5k8' )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, captionLocation, 'd3f5' )

            vRef = None
            for attrib,value in subelement.items():
                if attrib != "vref":
                    logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
                    continue
                vRef = value
                if BibleOrgSysGlobals.debugFlag: assert( vRef == '1' )
            if BibleOrgSysGlobals.debugFlag: assert( vRef )

            captionText = subelement.text
            if captionText:
                thisBook.addLine( 'v', '0 ' + captionText ) # The caption is saved as verse zero
            else:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
コード例 #12
0
    def __validateAndExtractHeader( self ):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. It was finished in 1611, just 85 years after the first translation of the New Testament into English appeared (Tyndale, 1526). The Authorized Version, or King James Version, quickly became the standard for English-speaking Protestants. Its flowing language and prose rhythm has had a profound influence on the literature of the past 300 years.</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Haggai XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
        </INFORMATION>
        """
        if BibleOrgSysGlobals.debugFlag: assert self.header
        location = 'Header'
        BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
        BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
        BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

        # TODO: We probably need to rationalise some of the self.xxx stores
        for element in self.header:
            #print( "header", element.tag )
            if element.tag == 'title':
                sublocation = "title in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.title = element.text
            elif element.tag == 'creator':
                sublocation = "creator in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.creator = element.text
            elif element.tag == 'subject':
                sublocation = "subject in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.subject = element.text
            elif element.tag == 'description':
                sublocation = "description in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.description = element.text
            elif element.tag == 'publisher':
                sublocation = "publisher in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.publisher = element.text
            elif element.tag == 'contributor':
                sublocation = "contributor in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'alj1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jjd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5gk78' )
                if element.text:
                    try: self.contributor = [ self.contributor, element.text ] # Put multiples into a list
                    except AttributeError: self.contributor = element.text # Must be the first (and possibly only) one
            elif element.tag == 'contributors':
                sublocation = "contributors in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.contributors = element.text
            elif element.tag == 'date':
                sublocation = "date in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.date = element.text
            elif element.tag == 'type':
                sublocation = "type in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.documentType = element.text
            elif element.tag == 'format':
                sublocation = "format in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language'
            elif element.tag == 'identifier':
                sublocation = "identifier in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.identifier = element.text
            elif element.tag == 'source':
                sublocation = "source in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.source = element.text
            elif element.tag == 'language':
                sublocation = "language in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.language = element.text
            elif element.tag == 'coverage':
                sublocation = "coverage in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.coverage = element.text
            elif element.tag == 'rights':
                sublocation = "rights in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.rights = element.text
            else: logging.error( "Found unexpected {!r} tag in {}".format( element.tag, location ) )
コード例 #13
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
    def loadParagraph( self, paragraphElement, paragraphLocation, BBB, C ):
        """
        Load the paragraph (p or q) container from the XML data file.

        The paragraph's own text is added to self.thisBook as a new line, then
            each child element is either added as a further line, appended to
            the last line, or handed to one of the other load... methods.

        Parameters:
            paragraphElement: the XML paragraph element to process.
            paragraphLocation: description of where the element is (used in messages).
            BBB: book code, passed on to the footnote and character formatting loaders.
            C: chapter number, passed on to those loaders and used in messages.

        Returns the id of the last verse milestone (v element) found in the
            paragraph, or None if there wasn't one.
        """
        V = None
        pTag, pText = paragraphElement.tag, clean(paragraphElement.text)
        BibleOrgSysGlobals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' )

        # Process the attributes first
        sfm = level = style = None
        for attrib,value in paragraphElement.items():
            if attrib == 'sfm': sfm = value
            elif attrib == 'level': level = value
            elif attrib == 'style': style = value
            else:
                logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) )

        if sfm: # An explicit sfm attribute replaces the generic 'p' tag
            assert( pTag == 'p' )
            pTag = sfm
        if level: # Not only for q -- could also be mt, etc.
            pTag += level
        if style:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring {!r} style".format( style ) )

        # clean() can give None for an element without text, so substitute an empty string
        self.thisBook.addLine( pTag, '' if pText is None else pText )

        for element in paragraphElement:
            location = element.tag + " of " + paragraphLocation
            if element.tag == 'v': # verse milestone
                vTail = clean( element.tail ) # Main verse text
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'crc2' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'lct3' )
                V = None # Forget the previous verse number until this milestone supplies its own
                for attrib,value in element.items():
                    if attrib == 'id':
                        V = value
                    else:
                        logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                assert( V ) # Must have a non-empty verse id
                self.thisBook.addLine( 'v', V + ((' '+vTail) if vTail else '' ) )
            elif element.tag == 've': # verse end milestone -- we can just ignore this
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'lsc3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'mfy4' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'bd24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ks35' )
            elif element.tag == 'fig':
                self.loadFigure( element, location )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 'f': # footnote
                self.loadFootnote( element, location, BBB, C, V )
            elif element.tag == 'x': # cross-reference
                self.loadCrossreference( element, location )
            elif element.tag in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                self.loadCharacterFormatting( element, location, BBB, C, V )
            elif element.tag == 'cs': # character style -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kf92' )
                sfm = None
                for attrib,value in element.items():
                    if attrib == 'sfm': sfm = value
                    else:
                        logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) )
                # 'text or' stops an element without text being written out as the word "None"
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text or '', sfm, (' '+tail) if tail else '' ) )
            elif element.tag in ('cp',): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'kdf0' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'lkj1' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'da13' )
                # Same None guard as for the paragraph text above
                self.thisBook.addLine( marker, '' if text is None else text )
            elif element.tag == 'ref': # encoded reference -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'bd83' )
                target = None
                for attrib,value in element.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # The target goes inside the marker pair and the displayed text follows the closing marker
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text or '', (' '+tail) if tail else '' ) )
            elif element.tag == 'optionalLineBreak':
                print( "What is loadParagraph optionalLineBreak?" )
                # NOTE(review): halt isn't defined in this method -- presumably a deliberate way
                #   of stopping a debug run at unhandled content (NameError unless defined elsewhere)
                if BibleOrgSysGlobals.debugFlag: halt
            elif element.tag == 'milestone': # e.g., <milestone sfm="pb" attribute=""/> (pb = explicit page break)
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'jzx2' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ms23' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'dw24' )
                sfm = None
                for attrib,value in element.items():
                    if attrib == 'sfm': sfm = value
                    else:
                        logging.warning( _("mcd2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if sfm not in ('pb',): print( "milestone sfm got", repr(sfm) )
                self.thisBook.addLine( sfm, '' )
            else:
                logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) )
        return V
コード例 #14
0
    def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
        """
        Check/validate one XML verse element and add its text to thisBook.

        The verse text and any styled portions are gathered into a single
            USFM string which is added as a 'v' line. A line break element
            flushes the text gathered so far and then adds an 'm' line, and
            the tail text of a note is added as a 'v~' line.

        Parameters:
            BBB: book code (used in messages).
            chapterNumber: chapter number (used in messages).
            thisBook: the book object that lines are added to (via addLine).
            verse: the XML verse element.
        """
        # Known css values and the USFM character marker that each one becomes
        cssToSFM = {
            "font-style:italic": '\\it',
            "font-style:italic;font-weight:bold": '\\bdit',
            "color:#FF0000": '\\em',
            "font-size: x-small; color:#8B8378": '\\add',
        }

        def styleToUSFM(styleElement, styleLocation, errorCode, elementKind):
            """
            Return the USFM for one style element: its text wrapped in the
                matching character marker, followed by its stripped tail text.

            elementKind ('subelement' or 'subsubelement') only affects the
                wording of the unprocessed-attribute warning.
            """
            BibleOrgSysGlobals.checkXMLNoSubelements(styleElement,
                                                     styleLocation, errorCode)
            css = idStyle = None
            for attrib, value in styleElement.items():
                if attrib == "css":
                    css = value
                elif attrib == "id":
                    idStyle = value
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in style {}".format(
                            attrib, value, elementKind))
            if BibleOrgSysGlobals.debugFlag:
                assert css or idStyle

            SFM = cssToSFM.get(css)
            if css is None and idStyle == 'cl:divineName':
                SFM = '\\nd'
            if SFM is None:
                logging.error(
                    "Unexpected style (css is {!r}, idStyle is {!r}) in {}"
                    .format(css, idStyle, styleLocation))
                # NOTE(review): halt isn't defined in this method -- presumably
                #   a deliberate way of stopping at unhandled content (NameError
                #   unless defined elsewhere). It's now only done in debug mode
                #   so that normal runs can reach the \sc fallback below.
                if BibleOrgSysGlobals.debugFlag: halt

            # An empty style element has text None, which can't be stripped
            sText = (styleElement.text or '').strip()
            if BibleOrgSysGlobals.debugFlag:
                assert sText
            if SFM:
                result = SFM + ' ' + sText + SFM + '*'
            else:  # Use sc for unknown styles (either css or idStyle might be None here)
                result = '\\sc ' + '[' + (css or idStyle or '') + ']' + sText + '\\sc* '
            sTail = styleElement.tail
            if sTail:
                result += sTail.strip()
            return result

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML verse..."))

        location = "verse in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoTail(verse, location, 'l5ks')

        # Handle verse attributes
        verseNumber = None
        for attrib, value in verse.items():
            if attrib == "vnumber":
                verseNumber = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in verse element".format(
                        attrib, value))
        if BibleOrgSysGlobals.debugFlag:
            assert verseNumber
        location = "{}:{}".format(
            location, verseNumber)  # Get a better location description

        # The text is None if a verse starts immediately with a style or note,
        #   so use an empty string that the styled portions can be appended to
        vText = verse.text.strip() if verse.text else ''

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == ZefaniaXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib, value in subelement.items():
                    if attrib == "type":
                        noteType = value
                    else:
                        logging.warning(
                            "Unprocessed {!r} attribute ({}) in note subelement"
                            .format(attrib, value))
                if noteType not in ('n-studynote', 'x-studynote'):
                    logging.warning("Unexpected {} note type in {}".format(
                        noteType, BBB))
                if BibleOrgSysGlobals.debugFlag:
                    assert noteType
                # TODO: The note's own text isn't saved anywhere -- losing data here (for now)
                nTail = subelement.tail
                if nTail:
                    if '\n' in nTail:
                        print(
                            "ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}"
                            .format(BBB, chapterNumber, verseNumber, nTail))
                        nTail = nTail.replace('\n', ' ')
                    thisBook.addLine('v~', nTail)
                for subsubelement in subelement:
                    if subsubelement.tag == ZefaniaXMLBible.styleTag:
                        vText += styleToUSFM(subsubelement,
                                             "style in " + sublocation,
                                             'fyt4', 'subsubelement')
                    else:
                        logging.error(
                            "Expected to find {} but got {!r} in {}".format(
                                ZefaniaXMLBible.styleTag, subsubelement.tag,
                                sublocation))

            elif subelement.tag == ZefaniaXMLBible.styleTag:
                vText += styleToUSFM(subelement, "style in " + location,
                                     'f5gh', 'subelement')

            elif subelement.tag == ZefaniaXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText(subelement, sublocation,
                                                  'c1d4')
                BibleOrgSysGlobals.checkXMLNoSubelements(
                    subelement, sublocation, 'g4g8')
                art = None
                for attrib, value in subelement.items():
                    if attrib == "art":
                        art = value
                    else:
                        logging.warning(
                            "Unprocessed {!r} attribute ({}) in break subelement"
                            .format(attrib, value))
                if BibleOrgSysGlobals.debugFlag:
                    assert art == 'x-nl'
                # Save what we have so far, then the text after the break goes on its own line
                if vText:
                    thisBook.addLine('v', verseNumber + ' ' + vText)
                    vText = ''
                thisBook.addLine(
                    'm',
                    subelement.tail.strip() if subelement.tail else '')
            else:
                logging.error(
                    "Expected to find NOTE or STYLE but got {!r} in {}".format(
                        subelement.tag, location))

        if vText:  # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print(
                    "ZefaniaXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}"
                    .format(BBB, chapterNumber, verseNumber, vText))
                vText = vText.replace('\n', ' ')
            thisBook.addLine('v', verseNumber + ' ' + vText)
コード例 #15
0
    def _validateSystem( self, punctuationTree, systemName ):
        """
        Check each record in punctuationTree against the element and attribute
            rules held in self (mainElementTags, compulsory/optional/unique
            attributes and elements), logging an error or warning for every
            problem found.

        Parameters:
            punctuationTree: the loaded XML tree (must not be empty).
            systemName: not used in this method.

        Nothing is returned -- all results are reported through logging.
        """
        assert punctuationTree

        # The values already seen in each field that must hold unique data
        uniqueDict = {}
        for elementName in self.uniqueElements: uniqueDict["Element_"+elementName] = set()
        for attributeName in self.uniqueAttributes: uniqueDict["Attribute_"+attributeName] = set()

        for k,element in enumerate(punctuationTree):
            if element.tag not in self.mainElementTags:
                logging.warning( _("Unexpected element: {} in record {}").format( element.tag, k ) )
                continue

            BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
            if not self.compulsoryAttributes and not self.optionalAttributes: BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
            if not self.compulsoryElements and not self.optionalElements: BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

            # The subelement messages below name the record by ID, but no ID was ever set
            #   (so they failed with a NameError) -- nothing visible here supplies one,
            #   so the element tag is used.
            # NOTE(review): replace this if the records carry a real ID field
            ID = element.tag

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is None:
                    logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, k ) )
                elif not attributeValue: # Only report blank if it's actually there
                    logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None and not attributeValue:
                    logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

            # Check for unexpected additional attributes on this main element
            for attributeName,attributeValue in element.items():
                if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                    logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, k ) )

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes:
                attributeValue = element.get( attributeName )
                if attributeValue is not None:
                    seenValues = uniqueDict["Attribute_"+attributeName]
                    if attributeValue in seenValues:
                        logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, k ) )
                    seenValues.add( attributeValue )

            # Check compulsory elements
            for elementName in self.compulsoryElements:
                foundElement = element.find( elementName )
                if foundElement is None:
                    logging.error( _("Compulsory {!r} element is missing in record with ID {!r} (record {})").format( elementName, ID, k ) )
                elif not foundElement.text: # Can only look at the text if the element was found
                    logging.warning( _("Compulsory {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

            # Check optional elements
            for elementName in self.optionalElements:
                foundElement = element.find( elementName )
                if foundElement is not None and not foundElement.text:
                    logging.warning( _("Optional {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                    logging.warning( _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})").format( subelement.tag, subelement.text, ID, k ) )

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements:
                foundElement = element.find( elementName )
                if foundElement is not None:
                    text = foundElement.text
                    seenValues = uniqueDict["Element_"+elementName]
                    if text in seenValues:
                        logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})").format( text, elementName, ID, k ) )
                    seenValues.add( text )
コード例 #16
0
    def __validateAndExtractHeader( self ):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. It was finished in 1611, just 85 years after the first translation of the New Testament into English appeared (Tyndale, 1526). The Authorized Version, or King James Version, quickly became the standard for English-speaking Protestants. Its flowing language and prose rhythm has had a profound influence on the literature of the past 300 years.</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Haggai XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
        </INFORMATION>
        """
        if BibleOrgSysGlobals.debugFlag: assert self.header
        location = 'Header'
        BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
        BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
        BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

        # TODO: We probably need to rationalise some of the self.xxx stores
        for element in self.header:
            #print( "header", element.tag )
            if element.tag == 'title':
                sublocation = "title in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.title = element.text
            elif element.tag == 'creator':
                sublocation = "creator in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.creator = element.text
            elif element.tag == 'subject':
                sublocation = "subject in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.subject = element.text
            elif element.tag == 'description':
                sublocation = "description in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.description = element.text
            elif element.tag == 'publisher':
                sublocation = "publisher in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.publisher = element.text
            elif element.tag == 'contributor':
                sublocation = "contributor in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'alj1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jjd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5gk78' )
                if element.text:
                    try: self.contributor = [ self.contributor, element.text ] # Put multiples into a list
                    except AttributeError: self.contributor = element.text # Must be the first (and possibly only) one
            elif element.tag == 'contributors':
                sublocation = "contributors in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.contributors = element.text
            elif element.tag == 'date':
                sublocation = "date in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.date = element.text
            elif element.tag == 'type':
                sublocation = "type in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.documentType = element.text
            elif element.tag == 'format':
                sublocation = "format in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language'
            elif element.tag == 'identifier':
                sublocation = "identifier in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.identifier = element.text
            elif element.tag == 'source':
                sublocation = "source in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.source = element.text
            elif element.tag == 'language':
                sublocation = "language in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if BibleOrgSysGlobals.debugFlag: assert element.text
                self.language = element.text
            elif element.tag == 'coverage':
                sublocation = "coverage in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.coverage = element.text
            elif element.tag == 'rights':
                sublocation = "rights in {}".format( location )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' )
                if element.text: self.rights = element.text
            else: logging.error( "Found unexpected {!r} tag in {}".format( element.tag, location ) )
コード例 #17
0
    def importDataToPython( self ):
        """
        Loads (and pivots) the data (not including the header) into suitable Python containers to use in a Python program.
        (Of course, you can just use the elementTree in self._XMLtree if you prefer.)

        The work is done in three passes:
            1/ The raw reference strings, components and link types are extracted from self._XMLtree.
            2/ Each source and target reference is checked against its declared component
                and then parsed with FlexibleVersesKey.
            3/ A dictionary keyed by verse is built which contains both the forward and the reversed links.

        Returns a 2-tuple containing
            a list of (sourceReference,sourceComponent,parsedSourceReference,actualLinksList) entries
                where actualLinksList contains (targetReference,targetComponent,parsedTargetReference,linkType) entries
            and a dictionary mapping each included verse key to a list of entries of that same shape.
        Both are also saved in self, and a non-empty saved list is simply returned again on later calls.

        Raises TypeError if a reference that's not declared to be a 'Verse' is accepted by SimpleVerseKey
            and ValueError if a link type has no reverse link type defined.
        """
        validComponents = ('Section','Verses','Verse',)
        reverseLinkTypes = {
            'TSK': 'TSKQuoted',
            'QuotedOTReference': 'OTReferenceQuoted',
            'AlludedOTReference': 'OTReferenceAlluded',
            'PossibleOTReference': 'OTReferencePossible',
            }

        def checkReference( component, reference ):
            """
            Checks that the reference string is consistent with its declared component.

            A 'Verse' must be accepted by SimpleVerseKey (any exception from it is passed on)
                but anything else must be rejected by it with a TypeError.
            """
            if component == 'Verse':
                SimpleVerseKey( reference )
                return
            try: SimpleVerseKey( reference, ignoreParseErrors=True )
            except TypeError: return # This should happen coz it should fail the SVK
            logging.error( "{} {!r} failed!".format( component, reference ) )
            raise TypeError
        # end of checkReference


        assert self._XMLtree
        if self.__DataList: # We've already done an import/restructuring -- no need to repeat it
            return self.__DataList, self.__DataDict

        # First pass: extract the raw strings from the XML
        rawRefLinkList = []
        actualLinkCount = 0
        for element in self._XMLtree:
            # Get these first for helpful error messages
            sourceReference = element.find('sourceReference').text
            sourceComponent = element.find('sourceComponent').text
            assert sourceComponent in validComponents

            BibleOrgSysGlobals.checkXMLNoText( element, sourceReference, 'kls1' )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, sourceReference, 'kd21' )
            BibleOrgSysGlobals.checkXMLNoTail( element, sourceReference, 'so20' )

            actualRawLinksList = []
            for subelement in element:
                if subelement.tag in ( 'sourceReference','sourceComponent',): # already processed these
                    BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sourceReference, 'ls12' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sourceReference, 'ks02' )
                    BibleOrgSysGlobals.checkXMLNoTail( subelement, sourceReference, 'sqw1' )

                elif subelement.tag == 'BibleReferenceLink':
                    BibleOrgSysGlobals.checkXMLNoText( subelement, sourceReference, 'haw9' )
                    BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sourceReference, 'hs19' )
                    BibleOrgSysGlobals.checkXMLNoTail( subelement, sourceReference, 'jsd9' )

                    targetReference = subelement.find('targetReference').text
                    targetComponent = subelement.find('targetComponent').text
                    assert targetComponent in validComponents
                    linkType = subelement.find('linkType').text
                    assert linkType in reverseLinkTypes

                    actualRawLinksList.append( (targetReference,targetComponent,linkType,) )
                    actualLinkCount += 1

            rawRefLinkList.append( (sourceReference,sourceComponent,actualRawLinksList,) )

        if BibleOrgSysGlobals.verbosityLevel > 1:
            print( "  {} raw links loaded (with {} actual raw link entries)".format( len(rawRefLinkList), actualLinkCount ) )


        # Second pass: check and parse all of the reference strings
        myRefLinkList = []
        actualLinkCount = 0
        # NOTE(review): BOS isn't used below -- the call is kept in case something relies on it having been loaded
        BOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

        for j,(sourceReference,sourceComponent,actualRawLinksList) in enumerate( rawRefLinkList ):
            checkReference( sourceComponent, sourceReference ) # Just do some testing first
            parsedSourceReference = FlexibleVersesKey( sourceReference ) # Now do the actual parsing
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                print( j, sourceComponent, sourceReference, parsedSourceReference )

            actualLinksList = []
            for targetReference,targetComponent,linkType in actualRawLinksList:
                checkReference( targetComponent, targetReference ) # Just do some testing first
                # A target that can't be parsed is kept in the list but with None instead of a parsed key
                try: parsedTargetReference = FlexibleVersesKey( targetReference )
                except TypeError:
                    print( "  Temporarily ignored {!r} (TypeError from FlexibleVersesKey)".format( targetReference ) )
                    parsedTargetReference = None
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                    print( ' ', targetComponent, targetReference, parsedTargetReference )

                actualLinksList.append( (targetReference,targetComponent,parsedTargetReference,linkType,) )
                actualLinkCount += 1

            myRefLinkList.append( (sourceReference,sourceComponent,parsedSourceReference,actualLinksList,) )

        if BibleOrgSysGlobals.verbosityLevel > 1:
            print( "  {} links processed (with {} actual link entries)".format( len(myRefLinkList), actualLinkCount ) )
        self.__DataList = myRefLinkList

        # Now put it into my dictionaries for easy access
        # This part should be customized or added to for however you need to process the data

        # Create a link dictionary (by verse key)
        myRefLinkDict = {}
        for sourceReference,sourceComponent,parsedSourceReference,actualLinksList in myRefLinkList:
            for verseRef in parsedSourceReference.getIncludedVerses():
                assert isinstance( verseRef, SimpleVerseKey )
                myRefLinkDict.setdefault( verseRef, [] ).append( (sourceReference,sourceComponent,parsedSourceReference,actualLinksList,) )
        originalLinks = len( myRefLinkDict ) # This is the number of verse keys (not the number of entries)
        print( "  {} verse links added to dictionary (includes filling out spans)".format( originalLinks ) )

        # Add the reversed links (by verse key) into the same dictionary
        for sourceReference,sourceComponent,parsedSourceReference,actualLinksList in myRefLinkList:
            for targetReference,targetComponent,parsedTargetReference,linkType in actualLinksList:
                if parsedTargetReference is None: continue # It couldn't be parsed above
                try: reverseLinkType = reverseLinkTypes[linkType]
                except KeyError: # Have a new linkType!
                    raise ValueError( "No reverse link type is defined for {!r}".format( linkType ) ) from None
                for verseRef in parsedTargetReference.getIncludedVerses():
                    assert isinstance( verseRef, SimpleVerseKey )
                    myRefLinkDict.setdefault( verseRef, [] ).append( (targetReference,targetComponent,parsedTargetReference,[(sourceReference,sourceComponent,parsedSourceReference,reverseLinkType)]) )
        totalLinks = len( myRefLinkDict )
        reverseLinks = totalLinks - originalLinks # i.e., the verse keys which only have reversed links
        print( "  {} reverse links added to dictionary to give {} total".format( reverseLinks, totalLinks ) )

        self.__DataDict = myRefLinkDict

        # Let's find the most number of references for a verse
        mostReferences = totalReferences = 0
        mostVerseRef = None # Stays None if the dictionary is empty
        for verseRef, entryList in self.__DataDict.items():
            numRefs = len( entryList )
            if numRefs > mostReferences: mostReferences, mostVerseRef = numRefs, verseRef
            totalReferences += numRefs
        if mostVerseRef is not None:
            print( "  {} maximum links for any one reference ({})".format( mostReferences, mostVerseRef.getShortText() ) )
        print( "  {} total links for all references".format( totalReferences ) )

        return self.__DataList, self.__DataDict
コード例 #18
0
    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (only used here in messages).
        thisBook is the book object which the 'c', 'q1' and 'v' lines get added to.
        chapter is the XML chapter element -- its 'n' attribute is the chapter
            number and its children are expected to be verse elements.

        Nothing is returned -- the results are added to thisBook and any
            problems are logged.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the chapter attributes first
        # ('VERSES' is recognised so that it's not reported, but its value isn't used)
        chapterNumber = numVerses = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                numVerses = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.verseTag, element.tag))
                continue

            sublocation = "verse in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')

            # ('t' is recognised so that it's not reported, but its value isn't used)
            verseNumber = toVerseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    toVerseNumber = value
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in verse element".
                        format(attrib, value))
            if BibleOrgSysGlobals.debugFlag: assert verseNumber

            # Gather the verse text, including any italicised pieces
            vText = element.text if element.text else ''
            for subelement in element:
                sub2location = "{} in {}".format(subelement.tag, sublocation)
                BibleOrgSysGlobals.checkXMLNoAttributes(
                    subelement, sub2location, 'ks03')
                BibleOrgSysGlobals.checkXMLNoSubelements(
                    subelement, sub2location, 'ks05')
                if subelement.tag == 'i':
                    # The text and tail are None (not '') when there's nothing
                    #   there, which would otherwise be formatted as 'None'
                    vText += '\\it {}\\it*{}'.format(subelement.text or '',
                                                     subelement.tail or '')
                else:
                    logging.error("Expected to find 'i' but got {!r}".format(
                        subelement.tag))
            vText += element.tail if element.tail else ''

            if not vText:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, verseNumber))
                continue

            # This is the main text of the verse
            # NOTE(review): a verse without an 'n' attribute fails here with a
            #   TypeError (verseNumber is None) unless the debug assert caught it
            if '\n' in vText:  # This is how they represent poetry
                firstBit, *otherBits = vText.split('\n')
                thisBook.addLine('q1', '')
                thisBook.addLine('v', verseNumber + ' ' + firstBit)
                for textBit in otherBits:
                    thisBook.addLine('q1', textBit)
            else:  # Just one verse line
                thisBook.addLine('v', verseNumber + ' ' + vText)
コード例 #19
0
        def loadParagraph( paragraphXML, paragraphlocation ):
            """ Load a paragraph from the USX XML.
                Uses (and updates) c,v information from the containing function. """
            nonlocal c, v

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else:
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            self.addLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                #print( "USXXMLBibleBook.load", c, v, element.tag, location )
                if element.tag == 'verse': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            v = value
                        elif attrib=='style':
                            verseStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    self.addLine( verseStyle, v + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail.strip()
                        if vText:
                            #print( repr(vText) )
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            #print( "  charStyle", charStyle )
                            assert( not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) )
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    charLine = "\\{} {} ".format( charStyle, element.text )
                    # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( c, v, element.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            subCharStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style': subCharStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            charLine += "\\{} {}".format( subCharStyle, subelement.text )
                            if charClosed: charLine += "\\{}*".format( subCharStyle )
                            charLine += '' if subelement.tail is None else subelement.tail.strip()
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # A character field must be added to the previous field
                    charLine += "\\{}*{}".format( charStyle, '' if element.tail is None else element.tail.strip() )
                    if debuggingThisModule: print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(charLine) )
                    self.appendToLastLine( charLine )
                elif element.tag == 'note':
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert( noteStyle in ('x','f',) )
                        elif attrib=='caller':
                            noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    assert( noteStyle and noteCaller ) # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( c, v, element.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                            noteLine += '' if subelement.tail is None else subelement.tail.strip()
                        elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            unmmatchedMarker = None
                            for attrib,value in subelement.items():
                                if attrib=='marker':
                                    unmmatchedMarker = value
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            self.addPriorityError( 2, c, v, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                    if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    #noteLine += "\\{}*".format( charStyle )
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        noteText = element.tail.strip()
                        noteLine += noteText
                    self.appendToLastLine( noteLine )
                elif element.tag == 'link': # Used to include extra resources
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    linkStyle = linkDisplay = linkTarget = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            linkStyle = value
                            assert( linkStyle in ('jmp',) )
                        elif attrib=='display':
                            linkDisplay = value # e.g., "click here"
                        elif attrib=='target':
                            linkTarget = value # e.g., some reference
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    self.addPriorityError( 3, c, v, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    self.addPriorityError( 2, c, v, _("Unmatched element in {}").format( location) )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, location ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if BibleOrgSysGlobals.debugFlag: halt
コード例 #20
0
    def load( self, filename, folder=None, encoding='utf-8' ):
        """
        Load a single source USX XML file and extract the information.
        """

        def loadParagraph( paragraphXML, paragraphlocation ):
            """ Load a paragraph from the USX XML.
                Uses (and updates) c,v information from the containing function. """
            nonlocal c, v

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else:
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            self.addLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                #print( "USXXMLBibleBook.load", c, v, element.tag, location )
                if element.tag == 'verse': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            v = value
                        elif attrib=='style':
                            verseStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    self.addLine( verseStyle, v + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail.strip()
                        if vText:
                            #print( repr(vText) )
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            #print( "  charStyle", charStyle )
                            assert( not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) )
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    charLine = "\\{} {} ".format( charStyle, element.text )
                    # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( c, v, element.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            subCharStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style': subCharStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            charLine += "\\{} {}".format( subCharStyle, subelement.text )
                            if charClosed: charLine += "\\{}*".format( subCharStyle )
                            charLine += '' if subelement.tail is None else subelement.tail.strip()
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # A character field must be added to the previous field
                    charLine += "\\{}*{}".format( charStyle, '' if element.tail is None else element.tail.strip() )
                    if debuggingThisModule: print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(charLine) )
                    self.appendToLastLine( charLine )
                elif element.tag == 'note':
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert( noteStyle in ('x','f',) )
                        elif attrib=='caller':
                            noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    assert( noteStyle and noteCaller ) # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( c, v, element.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                            noteLine += '' if subelement.tail is None else subelement.tail.strip()
                        elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            unmmatchedMarker = None
                            for attrib,value in subelement.items():
                                if attrib=='marker':
                                    unmmatchedMarker = value
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            self.addPriorityError( 2, c, v, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                    if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    #noteLine += "\\{}*".format( charStyle )
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        noteText = element.tail.strip()
                        noteLine += noteText
                    self.appendToLastLine( noteLine )
                elif element.tag == 'link': # Used to include extra resources
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    linkStyle = linkDisplay = linkTarget = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            linkStyle = value
                            assert( linkStyle in ('jmp',) )
                        elif attrib=='display':
                            linkDisplay = value # e.g., "click here"
                        elif attrib=='target':
                            linkTarget = value # e.g., some reference
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    self.addPriorityError( 3, c, v, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    self.addPriorityError( 2, c, v, _("Unmatched element in {}").format( location) )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, location ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if BibleOrgSysGlobals.debugFlag: halt
        # end of loadParagraph

        if BibleOrgSysGlobals.verbosityLevel > 2: print( "  " + _("Loading {}...").format( filename ) )
        self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList()
        self.sourceFilename = filename
        self.sourceFolder = folder
        self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
        self.tree = ElementTree().parse( self.sourceFilepath )
        assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        c = v = '0'
        loadErrors = []
        lastMarker = None

        # Find the main container
        if self.tree.tag=='usx' or self.tree.tag=='usfm': # Not sure why both are allowable
            location = "USX ({}) file".format( self.tree.tag )
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location )

            # Process the attributes first
            self.schemaLocation = ''
            version = None
            for attrib,value in self.tree.items():
                if attrib=='version': version = value
                logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if version not in ( None, '2.0' ):
                logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

            # Now process the data
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'book': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    # Process the attributes
                    idField = bookStyle = None
                    for attrib,value in element.items():
                        if attrib=='id' or attrib=='code':
                            idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode)
                            #if idField != BBB:
                            #    logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) )
                        elif attrib=='style':
                            bookStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if bookStyle != 'id':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                    idLine = idField
                    if element.text and element.text.strip(): idLine += ' ' + element.text
                    self.addLine( 'id', idLine )
                elif element.tag == 'chapter': # milestone (not a container)
                    v = '0'
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    # Process the attributes
                    chapterStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            c = value
                        elif attrib=='style':
                            chapterStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if chapterStyle != 'c':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                    self.addLine( 'c', c )
                elif element.tag == 'para':
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                    if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ):
                        #if lastMarker: self.addLine( lastMarker, lastText )
                        #lastMarker, lastText = USFMMarker, text
                        loadParagraph( element, sublocation )
                    elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line
                        text = element.text
                        if text is None: text = ''
                        if BibleOrgSysGlobals.debugFlag:
                            print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                            #halt # Not checked yet
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, c, v, USFMMarker ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, c, v ) )
                        self.addPriorityError( 97, c, v, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                        #lastText += '' if lastText.endswith(' ') else ' ' # Not always good to add a space, but it's their fault!
                        lastText =  '\\' + USFMMarker + ' ' + text
                        #print( "{} {} {} Now have {}:{!r}".format( self.BBB, c, v, lastMarker, lastText ) )
                    else: # the line begins with an unknown USFM Marker
                        text = element.text
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, c, v, USFMMarker, text ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, c, v, USFMMarker ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, c, v ) )
                        self.addPriorityError( 100, c, v, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                        for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                            if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                if lastMarker: self.addLine( lastMarker, lastText )
                                lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text
                                loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, c, v, USFMMarker, tryMarker, text ) )
                                logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, c, v, text ) )
                                break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, c, v, sublocation ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )

        if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
コード例 #21
0
    def __validate( self ):
        """
        Check/validate the loaded data.

        Iterates over every child of self._XMLtree.  Each child whose tag equals
        self._mainElementTag is treated as one record and is checked for:
            * compulsory attributes that are missing or blank,
            * optional attributes that are present but blank,
            * attributes that are in neither the compulsory nor the optional list,
            * compulsory subelements that are missing or blank,
            * optional subelements that are present but blank,
            * subelements that are in neither the compulsory nor the optional list,
            * repeated values in the attributes/subelements listed as unique.
        Children with any other tag are reported as unexpected, and non-blank
        tail text after any child (or after the tree itself) is reported too.

        Problems are only reported through the logging module; nothing is returned.
        Raises AssertionError if self._XMLtree is falsy.
        """
        assert self._XMLtree

        # One list of already-seen values per unique element / unique attribute
        uniqueDict = {}
        for elementName in self._uniqueElements: uniqueDict["Element_"+elementName] = []
        for attributeName in self._uniqueAttributes: uniqueDict["Attribute_"+attributeName] = []

        for j,element in enumerate(self._XMLtree):
            if element.tag == self._mainElementTag:
                BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
                BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
                if not self._compulsoryAttributes and not self._optionalAttributes: BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
                if not self._compulsoryElements and not self._optionalElements: BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

                # Check compulsory attributes on this main element
                for attributeName in self._compulsoryAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is None:
                        logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, j ) )
                    elif not attributeValue: # present but empty -- a missing attribute is not also reported as blank
                        logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, j ) )

                # Check optional attributes on this main element
                for attributeName in self._optionalAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None and not attributeValue:
                        logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, j ) )

                # Check for unexpected additional attributes on this main element
                for attributeName in element.keys():
                    attributeValue = element.get( attributeName )
                    if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                        logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, j ) )

                # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
                for attributeName in self._uniqueAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None:
                        if attributeValue in uniqueDict["Attribute_"+attributeName]:
                            logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, j ) )
                        uniqueDict["Attribute_"+attributeName].append( attributeValue )

                # Get the marker to use as a record ID
                # A record without a marker subelement must not abort the validation,
                #   so fall back to None (which then shows up as None in the messages below)
                markerElement = element.find( "marker" )
                marker = None if markerElement is None else markerElement.text

                # Check compulsory elements
                for elementName in self._compulsoryElements:
                    foundElement = element.find( elementName )
                    if foundElement is None: # NB: must compare with None -- a childless Element is falsy
                        logging.error( _("Compulsory {!r} element is missing in record with marker {!r} (record {})").format( elementName, marker, j ) )
                    elif not foundElement.text:
                        logging.warning( _("Compulsory {!r} element is blank in record with marker {!r} (record {})").format( elementName, marker, j ) )

                # Check optional elements
                for elementName in self._optionalElements:
                    foundElement = element.find( elementName )
                    if foundElement is not None and not foundElement.text:
                        logging.warning( _("Optional {!r} element is blank in record with marker {!r} (record {})").format( elementName, marker, j ) )

                # Check for unexpected additional elements
                for subelement in element:
                    if subelement.tag not in self._compulsoryElements and subelement.tag not in self._optionalElements:
                        logging.warning( _("Additional {!r} element ({!r}) found in record with marker {!r} (record {})").format( subelement.tag, subelement.text, marker, j ) )

                # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
                for elementName in self._uniqueElements:
                    foundElement = element.find( elementName )
                    if foundElement is not None:
                        text = foundElement.text
                        if text in uniqueDict["Element_"+elementName]:
                            logging.error( _("Found {!r} data repeated in {!r} element in record with marker {!r} (record {})").format( text, elementName, marker, j ) )
                        uniqueDict["Element_"+elementName].append( text )
            else:
                logging.warning( _("Unexpected element: {} in record {}").format( element.tag, j ) )
            # This tail check applies to every child, whether or not its tag was expected
            if element.tail is not None and element.tail.strip(): logging.error( _("Unexpected {!r} tail data after {} element in record {}").format( element.tail, element.tag, j ) )
        if self._XMLtree.tail is not None and self._XMLtree.tail.strip(): logging.error( _("Unexpected {!r} tail data after {} element").format( self._XMLtree.tail, self._XMLtree.tag ) )
コード例 #22
0
    def _validateSystem(self, punctuationTree, systemName):
        """
        Check the records of one loaded punctuation system tree.

        Parameters:
            punctuationTree: the XML element whose children are the records
                to check (must be truthy).
            systemName: name of the system being checked (accepted for the
                caller's benefit; not used by the checks below).

        Each child whose tag is in self.mainElementTags is checked for
        missing/blank compulsory attributes and subelements, blank optional
        ones, undeclared attributes and subelements, and repeated values in
        the attributes/subelements listed as unique.  Children with any other
        tag are reported as unexpected.

        Problems are only reported through the logging module; nothing is
        returned.  Raises AssertionError if punctuationTree is falsy.
        """
        assert punctuationTree

        # One list of already-seen values per unique element / unique attribute
        uniqueDict = {}
        for elementName in self.uniqueElements:
            uniqueDict["Element_" + elementName] = []
        for attributeName in self.uniqueAttributes:
            uniqueDict["Attribute_" + attributeName] = []

        for k, element in enumerate(punctuationTree):
            if element.tag in self.mainElementTags:
                # The log messages below name the record by an ID.  The
                # previous code referenced a name that was never assigned
                # (NameError as soon as a problem was found); the element's
                # tag is the only record identifier available here.
                recordID = element.tag

                BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
                if not self.compulsoryAttributes and not self.optionalAttributes:
                    BibleOrgSysGlobals.checkXMLNoAttributes(
                        element, element.tag)
                if not self.compulsoryElements and not self.optionalElements:
                    BibleOrgSysGlobals.checkXMLNoSubelements(
                        element, element.tag)

                # Check compulsory attributes on this main element
                for attributeName in self.compulsoryAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is None:
                        logging.error(
                            _("Compulsory {!r} attribute is missing from {} element in record {}"
                              ).format(attributeName, element.tag, k))
                    elif not attributeValue:
                        # Present but empty -- a missing attribute is not
                        # also reported as blank
                        logging.warning(
                            _("Compulsory {!r} attribute is blank on {} element in record {}"
                              ).format(attributeName, element.tag, k))

                # Check optional attributes on this main element
                for attributeName in self.optionalAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is not None and not attributeValue:
                        logging.warning(
                            _("Optional {!r} attribute is blank on {} element in record {}"
                              ).format(attributeName, element.tag, k))

                # Check for unexpected additional attributes on this main element
                for attributeName in element.keys():
                    attributeValue = element.get(attributeName)
                    if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                        logging.warning(
                            _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                              ).format(attributeName, attributeValue,
                                       element.tag, k))

                # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
                for attributeName in self.uniqueAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is not None:
                        seenValues = uniqueDict["Attribute_" + attributeName]
                        if attributeValue in seenValues:
                            logging.error(
                                _("Found {!r} data repeated in {!r} field on {} element in record {}"
                                  ).format(attributeValue, attributeName,
                                           element.tag, k))
                        seenValues.append(attributeValue)

                # Check compulsory elements
                for elementName in self.compulsoryElements:
                    foundElement = element.find(elementName)
                    # NB: must compare with None -- a childless Element is falsy
                    if foundElement is None:
                        logging.error(
                            _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                              ).format(elementName, recordID, k))
                    elif not foundElement.text:
                        # elif: a missing element has no .text to inspect
                        logging.warning(
                            _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                              ).format(elementName, recordID, k))

                # Check optional elements
                for elementName in self.optionalElements:
                    foundElement = element.find(elementName)
                    if foundElement is not None and not foundElement.text:
                        logging.warning(
                            _("Optional {!r} element is blank in record with ID {!r} (record {})"
                              ).format(elementName, recordID, k))

                # Check for unexpected additional elements
                for subelement in element:
                    if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                        logging.warning(
                            _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                              ).format(subelement.tag, subelement.text,
                                       recordID, k))

                # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
                for elementName in self.uniqueElements:
                    foundElement = element.find(elementName)
                    if foundElement is not None:
                        text = foundElement.text
                        seenTexts = uniqueDict["Element_" + elementName]
                        if text in seenTexts:
                            logging.error(
                                _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                                  ).format(text, elementName, recordID, k))
                        seenTexts.append(text)
            else:
                logging.warning(
                    _("Unexpected element: {} in record {}").format(
                        element.tag, k))
コード例 #23
0
    def __validateSystem( self, systemName ):
        """
        Checks for basic formatting/content errors in a Bible book name system.

        Parameters:
            systemName: key into self.__XMLSystems; that entry must hold a
                truthy 'tree' and a "languageCode" string.

        The language code is expected to be three characters long.  Then each
        child of the tree whose tag is in self.mainElementTags is checked
        against the attribute/element lists for that tag (the lists are
        indexed by the tag's position in self.mainElementTags):
        missing/blank compulsory attributes and subelements, blank optional
        ones, undeclared attributes and subelements, and repeated values in
        the attributes/subelements listed as unique.  Children with any other
        tag are reported as unexpected.

        Problems are only reported through the logging module; nothing is returned.
        Raises AssertionError if systemName or the system's tree is falsy.
        """
        assert systemName
        assert self.__XMLSystems[systemName]['tree']

        if len(self.__XMLSystems[systemName]["languageCode"]) != 3:
            logging.error( _("Couldn't find 3-letter language code in {!r} book names system").format( systemName ) )
        #if self.__ISOLanguages and not self.__ISOLanguages.isValidLanguageCode( self.__XMLSystems[systemName]["languageCode"] ): # Check that we have a valid language code
            #logging.error( _("Unrecognized {!r} ISO-639-3 language code in {!r} book names system").format( self.__XMLSystems[systemName]["languageCode"], systemName ) )

        # One list of already-seen values per (main element index, unique element / unique attribute)
        uniqueDict = {}
        for index in range( len(self.mainElementTags) ):
            for elementName in self.uniqueElements[index]: uniqueDict["Element_"+str(index)+"_"+elementName] = []
            for attributeName in self.uniqueAttributes[index]: uniqueDict["Attribute_"+str(index)+"_"+attributeName] = []

        for k,element in enumerate(self.__XMLSystems[systemName]['tree']):
            if element.tag in self.mainElementTags:
                BibleOrgSysGlobals.checkXMLNoText( element, element.tag )
                BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
                # NOTE(review): these two tests look at the whole per-tag containers rather than the entry for this tag -- confirm that is intended
                if not self.compulsoryAttributes and not self.optionalAttributes: BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
                if not self.compulsoryElements and not self.optionalElements: BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

                # Everything below uses the lists that belong to this particular main element tag
                index = self.mainElementTags.index( element.tag )

                # Check compulsory attributes on this main element
                for attributeName in self.compulsoryAttributes[index]:
                    attributeValue = element.get( attributeName )
                    if attributeValue is None:
                        logging.error( _("Compulsory {!r} attribute is missing from {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )
                    elif not attributeValue: # present but empty -- a missing attribute is not also reported as blank
                        logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )

                # Check optional attributes on this main element
                for attributeName in self.optionalAttributes[index]:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None and not attributeValue:
                        logging.warning( _("Optional {!r} attribute is blank on {} element in record {} in {}").format( attributeName, element.tag, k, systemName ) )

                # Check for unexpected additional attributes on this main element
                for attributeName in element.keys():
                    attributeValue = element.get( attributeName )
                    if attributeName not in self.compulsoryAttributes[index] and attributeName not in self.optionalAttributes[index]:
                        logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {} in {}").format( attributeName, attributeValue, element.tag, k, systemName ) )

                # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
                for attributeName in self.uniqueAttributes[index]:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None:
                        seenValues = uniqueDict["Attribute_"+str(index)+"_"+attributeName]
                        if attributeValue in seenValues:
                            logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {} in {}").format( attributeValue, attributeName, element.tag, k, systemName ) )
                        seenValues.append( attributeValue )

                # Check compulsory elements
                for elementName in self.compulsoryElements[index]:
                    foundElement = element.find( elementName )
                    if foundElement is None: # NB: must compare with None -- a childless Element is falsy
                        logging.error( _("Compulsory {!r} element is missing (record {}) in {}").format( elementName, k, systemName ) )
                    elif not foundElement.text: # elif: a missing element has no .text to inspect
                        logging.warning( _("Compulsory {!r} element is blank (record {}) in {}").format( elementName, k, systemName ) )

                # Check optional elements
                for elementName in self.optionalElements[index]:
                    foundElement = element.find( elementName )
                    if foundElement is not None and not foundElement.text:
                        logging.warning( _("Optional {!r} element is blank (record {}) in {}").format( elementName, k, systemName ) )

                # Check for unexpected additional elements
                for subelement in element:
                    if subelement.tag not in self.compulsoryElements[index] and subelement.tag not in self.optionalElements[index]:
                        logging.warning( _("Additional {!r} element ({!r}) found (record {}) in {} {}").format( subelement.tag, subelement.text, k, systemName, element.tag ) )

                # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
                for elementName in self.uniqueElements[index]:
                    foundElement = element.find( elementName )
                    if foundElement is not None:
                        text = foundElement.text
                        seenTexts = uniqueDict["Element_"+str(index)+"_"+elementName]
                        if text in seenTexts:
                            # Repeats inside BibleDivisionNames records are only logged at info level
                            myLogging = logging.info if element.tag == 'BibleDivisionNames' else logging.error
                            myLogging( _("Found {!r} data repeated in {!r} element (record {}) in {}").format( text, elementName, k, systemName ) )
                        seenTexts.append( text )
            else:
                logging.warning( _("Unexpected element: {} in record {} in {}").format( element.tag, k, systemName ) )
コード例 #24
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
 def loadFootnote( self, element, location, BBB, C, V ):
     """
     Convert a footnote element (and any xt field inside it) into USFM-style
         text that is appended to the last line of the current book.

     The opening f marker (with its caller) is written first, followed by each
         subelement in document order, and finally the closing f* marker
         along with any tail text of the footnote element.

     BBB, C, V are passed on to loadCharacterFormatting and used in the 'Ignored' messages.
     """
     # Markers that get handed over to loadCharacterFormatting
     characterMarkers = ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',)

     noteText = clean( element.text )
     noteTail = clean( element.tail )

     # Only the caller attribute is understood here
     caller = None
     for name,setting in element.items():
         if name != 'caller':
             logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( name, setting, location ) )
             continue
         caller = setting

     opening = ' \\f {}'.format( caller )
     if noteText: opening += ' ' + noteText
     self.thisBook.appendToLastLine( opening )

     for child in element:
         childLocation = child.tag + " of " + location
         marker = child.tag
         childText = clean( child.text )
         childTail = clean( child.tail )
         if BibleOrgSysGlobals.debugFlag:
             assert marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq','xt',)

         if marker != 'ref':
             BibleOrgSysGlobals.checkXMLNoAttributes( child, childLocation, 'dq54' )
             self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, childText ) )
             if marker == 'xt' or marker[0] == 'f': # Starts with f, e.g., fr, ft
                 # These fields may contain embedded references and character formatting
                 for grandchild in child:
                     grandLocation = grandchild.tag + " of " + childLocation
                     innerMarker = grandchild.tag
                     innerText = clean( grandchild.text )
                     innerTail = clean( grandchild.tail )
                     BibleOrgSysGlobals.checkXMLNoSubelements( grandchild, grandLocation, 'js72' )
                     if innerMarker == 'ref':
                         if innerText:
                             self.thisBook.appendToLastLine( innerText )
                         target = None
                         for name,setting in grandchild.items():
                             if name == 'tgt':
                                 target = setting # OSIS style reference, e.g., '1SA.27.8'
                             else:
                                 logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( name, setting, grandLocation ) )
                         if target:
                             self.thisBook.appendToLastLine( ' \\{} {}'.format( innerMarker, target ) )
                         elif debuggingThisModule: halt
                     elif innerMarker in characterMarkers: # character formatting
                         self.loadCharacterFormatting( grandchild, grandLocation, BBB, C, V )
                     else:
                         print( 'Ignored marker2', repr(innerMarker), BBB, C, V )
                         if debuggingThisModule: halt
                     if innerTail:
                         self.thisBook.appendToLastLine( innerTail )
             elif marker in characterMarkers: # character formatting
                 self.loadCharacterFormatting( child, childLocation, BBB, C, V )
             else:
                 print( 'Ignored marker', repr(marker), BBB, C, V )
                 halt
         else: # it's a ref element, which must have text and a tgt attribute
             assert childText
             BibleOrgSysGlobals.checkXMLNoSubelements( child, childLocation, 'ls13' )
             target = None
             for name,setting in child.items():
                 if name == 'tgt':
                     target = setting
                 else:
                     logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( name, setting, childLocation ) )
             if target:
                 self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, childText ) )
             else: halt

         # Any text following the subelement gets written after a closing marker
         if childTail:
             self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, childTail ) )

     closing = '\\f*'
     if noteTail: closing += ' ' + noteTail
     self.thisBook.appendToLastLine( closing )
コード例 #25
0
    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML verse record,
            saving the verse text (plus any styled portions, notes and
            line breaks found inside the verse) into thisBook.

        BBB and chapterNumber are only used here for location strings and log messages.
        thisBook receives the extracted lines through its addLine( marker, text ) method.
        verse is the XML verse element -- the verse number is read from its 'vnumber' attribute.
        """

        def findSFM( css, idStyle ):
            """
            Return the USFM character marker for the given style attributes,
                or None if the style isn't one of the recognised ones.
            """
            knownCSS = {
                'font-style:italic': '\\it',
                'font-style:italic;': '\\it',
                "font-style:italic;font-weight:bold": '\\bdit',
                "color:#FF0000": '\\em',
                "font-size: x-small; color:#8B8378": '\\add',
                }
            if css in knownCSS: return knownCSS[css]
            if css is None and idStyle=='cl:divineName': return '\\nd'
            return None

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = toVerseNumber = None
        for attrib,value in verse.items():
            if attrib == 'vnumber':
                verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert verseNumber
        location = "{}:{}".format( location, verseNumber ) # Get a better location description

        # verse.text is None if a verse starts immediately with a style or note
        #   so start with an empty string then, because text gets concatenated onto vText below
        vText = verse.text.strip() if verse.text else ''

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == ZefaniaXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib,value in subelement.items():
                    if attrib == 'type':
                        noteType = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
                if noteType not in ('n-studynote','x-studynote',):
                    logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
                if BibleOrgSysGlobals.debugFlag: assert noteType
                # NOTE: The note's own text (subelement.text) isn't saved anywhere yet, so that data is lost (for now)
                nTail = subelement.tail
                if nTail: # Text after the note is saved as a verse continuation line
                    if '\n' in nTail:
                        print( "ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                        nTail = nTail.replace( '\n', ' ' )
                    thisBook.addLine( 'v~', nTail )
                for sub2element in subelement:
                    if sub2element.tag == ZefaniaXMLBible.styleTag:
                        sub2location = "style in " + sublocation
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fyt4' )
                        css = idStyle = None
                        for attrib,value in sub2element.items():
                            if attrib == 'css':
                                css = value
                            elif attrib == 'id':
                                idStyle = value
                            else: logging.warning( "Unprocessed {!r} attribute ({}) in style sub2element".format( attrib, value ) )
                        if BibleOrgSysGlobals.debugFlag: assert css or idStyle
                        SFM = findSFM( css, idStyle )
                        if SFM is None:
                            logging.error( "Ignored1 css is {!r} idStyle is {!r}".format( css, idStyle ) )
                            if BibleOrgSysGlobals.debugFlag: halt
                        # The element text can be None (empty style element) so guard the strip
                        sText = sub2element.text.strip() if sub2element.text else ''
                        sTail = sub2element.tail
                        if BibleOrgSysGlobals.debugFlag: assert sText
                        if SFM: vText += SFM+' ' + sText + SFM+'*'
                        else: vText += '\\sc ' + '['+(css if css else '')+']' + sText + '\\sc* ' # Use sc for unknown styles (css can be None if only an id was given)
                        if sTail: vText += sTail.strip()
                    else: logging.error( "Expected to find {} but got {!r} in {}".format( ZefaniaXMLBible.styleTag, sub2element.tag, sublocation ) )

            elif subelement.tag == ZefaniaXMLBible.styleTag:
                sublocation = "style in " + location
                css = idStyle = None
                for attrib,value in subelement.items():
                    if attrib == 'css':
                        css = value
                    elif attrib == 'id':
                        idStyle = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert css or idStyle
                SFM = findSFM( css, idStyle )
                if SFM is None:
                    logging.error( "Ignored2 css is {!r} idStyle is {!r}".format( css, idStyle ) )
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt

                for sub2element in subelement:
                    if sub2element.tag == ZefaniaXMLBible.grTag:
                        sub2location = "gr in " + sublocation
                        BibleOrgSysGlobals.checkXMLNoAttributes( sub2element, sub2location, 'ks12' )
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'dl36' )
                        BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'js24' )
                        grText = sub2element.text.strip() if sub2element.text else ''
                        logging.error( "Unfinished to process 'gr' {!r} sub2element ({}) in style subelement".format( grText, sublocation ) )
                    else: logging.error( "Expected to find 'gr' but got {!r} in {}".format( sub2element.tag, sublocation ) )

                sText = subelement.text.strip() if subelement.text else ''
                sTail = subelement.tail.strip() if subelement.tail else None
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: assert sText
                if SFM: vText += SFM+' ' + sText + SFM+'*'
                else: vText += '\\sc ' + '['+(css if css else '')+']' + sText + '\\sc* ' # Use sc for unknown styles
                if sTail: vText += sTail

            elif subelement.tag == ZefaniaXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
                art = None
                for attrib,value in subelement.items():
                    if attrib == 'art':
                        art = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
                # Save whatever verse text has been collected so far before starting the new line
                if vText:
                    if '\n' in vText:
                        logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse_a: newline in vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                        vText = vText.replace( '\n', ' ' )
                    thisBook.addLine( 'v', verseNumber + ' ' + vText )
                    vText = ''
                # The text after the break goes into its own (margin) paragraph line
                breakText = subelement.tail.strip() if subelement.tail else ''
                if '\n' in breakText:
                    logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse: newline in breakText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, breakText ) )
                    breakText = breakText.replace( '\n', ' ' )
                thisBook.addLine( 'm', breakText )

            elif subelement.tag == ZefaniaXMLBible.divTag:
                sublocation = "div break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'ld46' )
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'kx10' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'las9' )
                for sub2element in subelement:
                    if sub2element.tag == 'NOTE':
                        sub2location = "NOTE in " + sublocation
                        BibleOrgSysGlobals.checkXMLNoAttributes( sub2element, sub2location, 'lc35' )
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ks27' )
                        BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ksd1' )
                        noteText = sub2element.text.strip() if sub2element.text else ''
                        vText += '\\f {}\\f*'.format( noteText )
                    else: logging.error( "Expected to find 'NOTE' but got {!r} in {}".format( sub2element.tag, sublocation ) )

            elif subelement.tag == ZefaniaXMLBible.grTag:
                sublocation = "gr in " + location
                BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'ksd2' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'ls10' )
                BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'cg27' )
                grText = subelement.text.strip() if subelement.text else ''
                logging.error( "Unfinished to process 'gr' {!r} subelement ({}) in style subelement".format( grText, location ) )

            else: logging.error( "Expected to find NOTE or STYLE or BREAK or DIV but got {!r} in {}".format( subelement.tag, location ) )

        if vText: # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                logging.warning( "ZefaniaXMLBible.__validateAndExtractVerse_b: newline in vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            thisBook.addLine( 'v', verseNumber + ' ' + vText )
コード例 #26
0
    def __validateSystem(self, systemName):
        """
        Checks for basic formatting/content errors in a Bible book name system.

        Each top-level element of the loaded XML tree for systemName is checked
        against the compulsory/optional/unique attribute and element lists
        (indexed in parallel with self.mainElementTags). Problems are reported
        through the logging module -- nothing is returned.
        """
        assert systemName
        systemData = self.__XMLSystems[systemName]
        assert systemData['tree']

        if len(systemData["languageCode"]) != 3:
            logging.error(
                _("Couldn't find 3-letter language code in {!r} book names system"
                  ).format(systemName))
        #if self.__ISOLanguages and not self.__ISOLanguages.isValidLanguageCode( self.__XMLSystems[systemName]["languageCode"] ): # Check that we have a valid language code
        #logging.error( _("Unrecognized {!r} ISO-639-3 language code in {!r} book names system").format( self.__XMLSystems[systemName]["languageCode"], systemName ) )

        # One collection of already-seen values per unique element/attribute
        #   (sets, because they're only ever used for membership tests)
        uniqueDict = {}
        for index in range(len(self.mainElementTags)):
            for elementName in self.uniqueElements[index]:
                uniqueDict["Element_" + str(index) + "_" + elementName] = set()
            for attributeName in self.uniqueAttributes[index]:
                uniqueDict["Attribute_" + str(index) + "_" + attributeName] = set()

        for k, element in enumerate(systemData['tree']):
            if element.tag not in self.mainElementTags:
                logging.warning(
                    _("Unexpected element: {} in record {} in {}").format(
                        element.tag, k, systemName))
                continue

            BibleOrgSysGlobals.checkXMLNoText(element, element.tag)
            BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
            if not self.compulsoryAttributes and not self.optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
            if not self.compulsoryElements and not self.optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

            index = self.mainElementTags.index(element.tag)

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes[index]:
                attributeValue = element.get(attributeName)
                if attributeValue is None:
                    logging.error(
                        _("Compulsory {!r} attribute is missing from {} element in record {} in {}"
                          ).format(attributeName, element.tag, k, systemName))
                elif not attributeValue:  # Present but empty (a missing one is already reported above)
                    logging.warning(
                        _("Compulsory {!r} attribute is blank on {} element in record {} in {}"
                          ).format(attributeName, element.tag, k, systemName))

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes[index]:
                attributeValue = element.get(attributeName)
                if attributeValue is not None and not attributeValue:
                    logging.warning(
                        _("Optional {!r} attribute is blank on {} element in record {} in {}"
                          ).format(attributeName, element.tag, k, systemName))

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get(attributeName)
                if (attributeName not in self.compulsoryAttributes[index]
                        and attributeName not in self.optionalAttributes[index]):
                    logging.warning(
                        _("Additional {!r} attribute ({!r}) found on {} element in record {} in {}"
                          ).format(attributeName, attributeValue, element.tag,
                                   k, systemName))

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes[index]:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    seenValues = uniqueDict["Attribute_" + str(index) + "_" + attributeName]
                    if attributeValue in seenValues:
                        logging.error(
                            _("Found {!r} data repeated in {!r} field on {} element in record {} in {}"
                              ).format(attributeValue, attributeName,
                                       element.tag, k, systemName))
                    seenValues.add(attributeValue)

            # Check compulsory elements
            for elementName in self.compulsoryElements[index]:
                found = element.find(elementName)
                if found is None:
                    logging.error(
                        _("Compulsory {!r} element is missing (record {}) in {}"
                          ).format(elementName, k, systemName))
                elif not found.text:  # Only look at the text if the element actually exists
                    logging.warning(
                        _("Compulsory {!r} element is blank (record {}) in {}"
                          ).format(elementName, k, systemName))

            # Check optional elements
            for elementName in self.optionalElements[index]:
                found = element.find(elementName)
                if found is not None and not found.text:
                    logging.warning(
                        _("Optional {!r} element is blank (record {}) in {}"
                          ).format(elementName, k, systemName))

            # Check for unexpected additional elements
            for subelement in element:
                if (subelement.tag not in self.compulsoryElements[index]
                        and subelement.tag not in self.optionalElements[index]):
                    logging.warning(
                        _("Additional {!r} element ({!r}) found (record {}) in {} {}"
                          ).format(subelement.tag, subelement.text, k,
                                   systemName, element.tag))

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements[index]:
                found = element.find(elementName)
                if found is not None:
                    text = found.text
                    seenValues = uniqueDict["Element_" + str(index) + "_" + elementName]
                    if text in seenValues:
                        # Repeats are only reported at info level for BibleDivisionNames elements
                        myLogging = logging.info if element.tag == 'BibleDivisionNames' else logging.error
                        myLogging(
                            _("Found {!r} data repeated in {!r} element (record {}) in {}"
                              ).format(text, elementName, k, systemName))
                    seenValues.add(text)
コード例 #27
0
    def load( self ):
        """
        Parse the single XML source file, saving the header fields
            as supplied metadata and passing each book element on for extraction.
        """
        def checkLeaf( leaf, leafLocation ):
            """
            The simple header fields should have no attributes, subelements, or tail.
            """
            BibleOrgSysGlobals.checkXMLNoAttributes( leaf, leafLocation, 'jk86' )
            BibleOrgSysGlobals.checkXMLNoSubelements( leaf, leafLocation, 'hjk7' )
            BibleOrgSysGlobals.checkXMLNoTail( leaf, leafLocation, 'bh09' )

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print( _("Loading {}…").format( self.sourceFilepath ) )
        self.tree = ElementTree().parse( self.sourceFilepath )
        if BibleOrgSysGlobals.debugFlag:
            assert len( self.tree ) # Fail here if we didn't load anything at all

        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        self.suppliedMetadata['VerseView'] = {}

        # Find the main (bible) container
        if self.tree.tag != VerseViewXMLBible.treeTag:
            logging.error( "Expected to load {!r} but got {!r}".format( VerseViewXMLBible.treeTag, self.tree.tag ) )
        else:
            location = "VerseView XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoAttributes( self.tree, location, 'js24' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # Find the submain (various info and then book) containers
            bookNumber = 0
            for element in self.tree:
                tag = element.tag
                if tag == VerseViewXMLBible.filenameTag:
                    checkLeaf( element, "filename in " + location ) # The filename itself isn't saved
                elif tag == VerseViewXMLBible.revisionTag:
                    checkLeaf( element, "revision in " + location )
                    self.suppliedMetadata['VerseView']['Revision'] = element.text
                elif tag == VerseViewXMLBible.titleTag:
                    checkLeaf( element, "title in " + location )
                    self.suppliedMetadata['VerseView']['Title'] = element.text
                elif tag == VerseViewXMLBible.fontTag:
                    checkLeaf( element, "font in " + location )
                    self.suppliedMetadata['VerseView']['Font'] = element.text
                elif tag == VerseViewXMLBible.copyrightTag:
                    checkLeaf( element, "copyright in " + location )
                    self.suppliedMetadata['VerseView']['Copyright'] = element.text
                elif tag == VerseViewXMLBible.sizefactorTag:
                    checkLeaf( element, "sizefactor in " + location )
                    if BibleOrgSysGlobals.debugFlag:
                        assert element.text == '1'
                elif tag == VerseViewXMLBible.bookTag:
                    bookLocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, bookLocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, bookLocation, 'd3f6' )
                    bookNumber += 1 # Books are numbered in the order that they're found
                    self.__validateAndExtractBook( element, bookNumber )
                else:
                    logging.error( "xk15 Expected to find {!r} but got {!r}".format( VerseViewXMLBible.bookTag, tag ) )

        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            # These are all compulsory so they should all exist
            for template, key in (
                    ( "Revision is {!r}", 'Revision' ),
                    ( "Title is {!r}", 'Title' ),
                    ( "Font is {!r}", 'Font' ),
                    ( "Copyright is {!r}", 'Copyright' ),
                    ):
                print( template.format( self.suppliedMetadata['VerseView'][key] ) )

        self.applySuppliedMetadata( 'VerseView' ) # Copy some to self.settingsDict
        self.doPostLoadProcessing()
コード例 #28
0
    def load( self, filename, folder=None, encoding='utf-8' ):
        """
        Load a single source USX XML file and extract the information.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print( exp("load( {}, {}, {} )").format( filename, folder, encoding ) )

        def loadParagraph( paragraphXML, paragraphlocation ):
            """
            Load a paragraph from the USX XML.
            In this context, paragraph means heading and intro lines,
                as well as paragraphs of verses.

            Uses (and updates) C,V information from the containing function.
            """
            nonlocal C, V

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else:
                    logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else ''
            if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2
            self.addLine( paragraphStyle, paragraphText )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                #print( "USXXMLBibleBook.load", C, V, element.tag, location )
                if element.tag == 'verse': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = altNumber = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            V = value
                        elif attrib=='style':
                            verseStyle = value
                        elif attrib=='altnumber':
                            altNumber = value
                        else:
                            logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    #if altNumber: print( repr(verseStyle), repr(altNumber) ); halt
                    altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else ''
                    self.addLine( verseStyle, V + altStuff + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail
                        if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references on a new line
                        if vText:
                            #print( repr(vText) )
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            #print( "  charStyle", charStyle )
                            assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle )
                        else:
                            logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    charLine = "\\{} {} ".format( charStyle, element.text )
                    # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( '{} {}:{} {}'.format( self.BBB, C, V, element.tag ) )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            subCharStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style': subCharStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            charLine += "\\{} {}".format( subCharStyle, subelement.text )
                            if charClosed: charLine += "\\{}*".format( subCharStyle )
                            #if subelement.tail is not None: print( "  tail1", repr(subelement.tail) )
                            charLine += '' if subelement.tail is None else subelement.tail
                        else:
                            logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # A character field must be added to the previous field
                    #if element.tail is not None: print( " tail2", repr(element.tail) )
                    charTail = ''
                    if element.tail:
                        charTail = element.tail
                        if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines
                    charLine += "\\{}*{}".format( charStyle, charTail )
                    #if debuggingThisModule: print( "USX.loadParagraph:", C, V, paragraphStyle, charStyle, repr(charLine) )
                    self.appendToLastLine( charLine )
                elif element.tag == 'note':
                    #print( "NOTE", BibleOrgSysGlobals.elementStr( element ) )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert noteStyle in ('x','f',)
                        elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack
                    assert noteStyle and noteCaller # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    if element.text:
                        noteText = element.text.strip()
                        noteLine += noteText
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( C, V, subelement.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text )
                            # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                            for sub2element in subelement:
                                sub2location = sub2element.tag + ' ' + sublocation
                                #print( C, V, sub2element.tag )
                                if sub2element.tag == 'char': # milestone (not a container)
                                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location )
                                    # Process the attributes first
                                    char2Style, char2Closed = None, True
                                    for attrib,value in sub2element.items():
                                        if attrib=='style':
                                            char2Style = value
                                        elif attrib=='closed':
                                            assert value=='false'
                                            char2Closed = False
                                        else:
                                            logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                                    assert char2Closed
                                    noteLine += "\\{} {}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                            if subelement.tail:
                                charTail = subelement.tail
                                if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line
                                noteLine += charTail
                        elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            unmmatchedMarker = None
                            for attrib,value in subelement.items():
                                if attrib=='marker':
                                    unmmatchedMarker = value
                                else:
                                    logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                        if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    #noteLine += "\\{}*".format( charStyle )
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        #if '\n' in element.tail: halt
                        noteTail = element.tail
                        if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines
                        noteLine += noteTail
                    #print( "NoteLine", repr(noteLine) )
                    self.appendToLastLine( noteLine )
                elif element.tag == 'link': # Used to include extra resources
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    linkStyle = linkDisplay = linkTarget = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            linkStyle = value
                            assert linkStyle in ('jmp',)
                        elif attrib=='display':
                            linkDisplay = value # e.g., "click here"
                        elif attrib=='target':
                            linkTarget = value # e.g., some reference
                        else:
                            logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) )
                    self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if BibleOrgSysGlobals.debugFlag: halt
        # end of loadParagraph

        C = V = '0'
        loadErrors = []
        lastMarker = None

        if BibleOrgSysGlobals.verbosityLevel > 3: print( "  " + _("Loading {} from {}…").format( filename, folder ) )
        elif BibleOrgSysGlobals.verbosityLevel > 2: print( "  " + _("Loading {}…").format( filename ) )
        self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList()
        self.sourceFilename = filename
        self.sourceFolder = folder
        self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
            loadErrors.append( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) )
            self.addPriorityError( 100, C, V, _("Loader parse error in xml file {}: {}").format( filename, err ) )
        if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

        # Find the main container
        if 'tree' in dir(self) \
        and ( self.tree.tag=='usx' or self.tree.tag=='usfm' ): # Not sure why both are allowable
            location = "USX ({}) file".format( self.tree.tag )
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location )

            # Process the attributes first
            self.schemaLocation = ''
            version = None
            for attrib,value in self.tree.items():
                if attrib=='version': version = value
                else: logging.warning( _("DG84 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if version not in ( None, '2.0' ):
                logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

            # Now process the data
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'book': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    # Process the attributes
                    idField = bookStyle = None
                    for attrib,value in element.items():
                        if attrib=='id' or attrib=='code':
                            idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode)
                            #if idField != BBB:
                            #    logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) )
                        elif attrib=='style':
                            bookStyle = value
                        else:
                            logging.warning( _("MD12 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if bookStyle != 'id':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                    idLine = idField
                    if element.text and element.text.strip(): idLine += ' ' + element.text
                    self.addLine( 'id', idLine )
                elif element.tag == 'chapter': # milestone (not a container)
                    V = '0'
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation )
                    # Process the attributes
                    chapterStyle = pubNumber = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            C = value
                        elif attrib=='style':
                            chapterStyle = value
                        elif attrib=='pubnumber':
                            pubNumber = value
                        else:
                            logging.error( _("LY76 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if chapterStyle != 'c':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                    #if pubNumber: print( self.BBB, C, repr(pubNumber) ); halt
                    self.addLine( 'c', C )
                    if pubNumber: self.addLine( 'cp', pubNumber )
                elif element.tag == 'para':
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation )
                    USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                    if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ):
                        #if lastMarker: self.addLine( lastMarker, lastText )
                        #lastMarker, lastText = USFMMarker, text
                        loadParagraph( element, sublocation )
                    elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line
                        text = element.text
                        if text is None: text = ''
                        if BibleOrgSysGlobals.debugFlag:
                            print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            #halt # Not checked yet
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, C, V, USFMMarker ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                        self.addPriorityError( 97, C, V, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                        #lastText += '' if lastText.endswith(' ') else ' ' # Not always good to add a space, but it's their fault!
                        lastText =  '\\' + USFMMarker + ' ' + text
                        #print( "{} {} {} Now have {}:{!r}".format( self.BBB, C, V, lastMarker, lastText ) )
                    else: # the line begins with an unknown USFM Marker
                        try: status = element.attrib['status']
                        except KeyError: status = None
                        text = element.text
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, C, V, USFMMarker ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) )
                        self.addPriorityError( 100, C, V, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                        if status == 'unknown': # USX exporter already knew it was a bad marker
                            pass # Just drop it completely
                        else:
                            for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                                if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                    if lastMarker: self.addLine( lastMarker, lastText )
                                    lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text
                                    loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, C, V, USFMMarker, tryMarker, text ) )
                                    logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, C, V, text ) )
                                    break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on
                else:
                    logging.error( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, sublocation ) )
                    self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )

        if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
コード例 #29
0
    def __validateSystem( self, bookOrderTree, systemName ):
        """
        Do a semi-automatic check of the XML file validity.

        bookOrderTree is the (non-empty) sequence of XML records to check and
            systemName is only used in the log messages.

        Each record whose tag is self.mainElementTag is checked against
            self.compulsoryAttributes, self.optionalAttributes, self.uniqueAttributes,
            self.compulsoryElements, self.optionalElements and self.uniqueElements,
            and its id attribute is expected to count upwards from 1.
        Records with any other tag are reported with a warning.

        Problems are reported through the logging module -- nothing is returned.
        Note that int() will still raise an exception if a record
            has a missing or non-numeric id attribute.
        """
        assert bookOrderTree

        # One list of already-seen values for each field that must be unique
        uniqueDict = {}
        for elementName in self.uniqueElements: uniqueDict["Element_"+elementName] = []
        for attributeName in self.uniqueAttributes: uniqueDict["Attribute_"+attributeName] = []

        expectedID = 1
        for k,element in enumerate(bookOrderTree):
            if element.tag == self.mainElementTag:
                BibleOrgSysGlobals.checkXMLNoTail( element, element.tag )
                if not self.compulsoryAttributes and not self.optionalAttributes: BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag )
                if not self.compulsoryElements and not self.optionalElements: BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag )

                # Check ascending ID field
                ID = element.get("id")
                intID = int( ID )
                if intID != expectedID:
                    logging.error( _("ID numbers out of sequence in record {} (got {} when expecting {}) for {}").format( k, intID, expectedID, systemName ) )
                expectedID += 1

                # Check that this is unique
                if element.text:
                    if element.text in uniqueDict:
                        logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {}) for {}").format( element.text, element.tag, ID, k, systemName ) )
                    uniqueDict[element.text] = None

                # Check compulsory attributes on this main element
                for attributeName in self.compulsoryAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is None:
                        logging.error( _("Compulsory {!r} attribute is missing from {} element in record {}").format( attributeName, element.tag, k ) )
                    elif not attributeValue: # Only report blank if it was actually there
                        logging.warning( _("Compulsory {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

                # Check optional attributes on this main element
                for attributeName in self.optionalAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None:
                        if not attributeValue:
                            logging.warning( _("Optional {!r} attribute is blank on {} element in record {}").format( attributeName, element.tag, k ) )

                # Check for unexpected additional attributes on this main element
                for attributeName in element.keys():
                    attributeValue = element.get( attributeName )
                    if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                        logging.warning( _("Additional {!r} attribute ({!r}) found on {} element in record {}").format( attributeName, attributeValue, element.tag, k ) )

                # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
                for attributeName in self.uniqueAttributes:
                    attributeValue = element.get( attributeName )
                    if attributeValue is not None:
                        if attributeValue in uniqueDict["Attribute_"+attributeName]:
                            logging.error( _("Found {!r} data repeated in {!r} field on {} element in record {}").format( attributeValue, attributeName, element.tag, k ) )
                        uniqueDict["Attribute_"+attributeName].append( attributeValue )

                # Check compulsory elements
                for elementName in self.compulsoryElements:
                    foundElement = element.find( elementName )
                    if foundElement is None:
                        logging.error( _("Compulsory {!r} element is missing in record with ID {!r} (record {})").format( elementName, ID, k ) )
                    elif not foundElement.text: # Can't look at the text of a missing element
                        logging.warning( _("Compulsory {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

                # Check optional elements
                for elementName in self.optionalElements:
                    foundElement = element.find( elementName )
                    if foundElement is not None:
                        if not foundElement.text:
                            logging.warning( _("Optional {!r} element is blank in record with ID {!r} (record {})").format( elementName, ID, k ) )

                # Check for unexpected additional elements
                for subelement in element:
                    if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                        logging.warning( _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})").format( subelement.tag, subelement.text, ID, k ) )

                # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
                for elementName in self.uniqueElements:
                    foundElement = element.find( elementName )
                    if foundElement is not None:
                        text = foundElement.text
                        if text in uniqueDict["Element_"+elementName]:
                            logging.error( _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})").format( text, elementName, ID, k ) )
                        uniqueDict["Element_"+elementName].append( text )
            else:
                logging.warning( _("Unexpected element: {} in record {}").format( element.tag, k ) )
コード例 #30
0
    def validateEntry( self, entry ):
        """
        Check/validate the given Strongs Greek lexicon entry.
        """
        if BibleOrgSysGlobals.debugFlag: assert( entry.tag == "entry" )
        BibleOrgSysGlobals.checkXMLNoText( entry, entry.tag, "na19" )
        BibleOrgSysGlobals.checkXMLNoTail( entry, entry.tag, "kaq9" )

        # Process the entry attributes first
        strongs5 = None
        for attrib,value in entry.items():
            if attrib ==  "strongs":
                strongs5 = value
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Validating {} entry...".format( strongs5 ) )
            else: logging.warning( "Unprocessed {!r} attribute ({}) in main entry element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert( len(strongs5)==5 and strongs5.isdigit() )

        entryResults = {}
        entryString = ""
        gettingEssentials = True
        for j, element in enumerate( entry ):
            #print( strongs5, j, element.tag, repr(entryString) )
            if element.tag == "strongs":
                if BibleOrgSysGlobals.debugFlag: assert( gettingEssentials and j==0 and element.text )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag, "md3d" )
                if strongs5!='02717' and (3203 > int(strongs5) > 3302):
                    BibleOrgSysGlobals.checkXMLNoTail( element, element.tag, "f3g7" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag, "m56g" )
                strongs = element.text
                if BibleOrgSysGlobals.debugFlag: assert( strongs5.endswith( strongs ) )
                if element.tail and element.tail.strip(): entryString += element.tail.strip()
            elif element.tag == "greek":
                location = "greek in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "jke0" )
                #BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "df35" )
                # Process the attributes
                translit = greek = beta = None
                for attrib,value in element.items():
                    if attrib=="translit": translit = value
                    elif attrib=="unicode": greek = value
                    elif attrib=="BETA": beta = value
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if BibleOrgSysGlobals.debugFlag: assert( greek and translit and beta )
                if 'word' not in entryResults: # This is the first/main entry
                    if BibleOrgSysGlobals.debugFlag: assert( gettingEssentials and j==1 )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                    entryResults['word'] = (greek, translit, beta)
                else:
                    #print( "Have multiple greek entries in " + strongs5 )
                    if BibleOrgSysGlobals.debugFlag: assert( j > 2 )
                    gettingEssentials = False
                    entryString += ' ' + BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ) #.replace( '\n', '' )
            elif element.tag == "pronunciation":
                location = "pronunciation in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
                # Process the attributes
                pronunciation = None
                for attrib,value in element.items():
                    if attrib=="strongs": pronunciation = value
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if gettingEssentials:
                    #BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
                    if BibleOrgSysGlobals.debugFlag:
                        assert( j == 2 )
                        assert( pronunciation )
                        assert( 'pronunciation' not in entryResults )
                    entryResults['pronunciation'] = pronunciation
                else:
                    if BibleOrgSysGlobals.debugFlag: assert( j>2 and not gettingEssentials )
                    if element.tail and element.tail.strip(): entryString += element.tail.strip().replace( '\n', '' )
            elif element.tag == "strongs_derivation":
                location = "strongs_derivation in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                derivation = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "derivation", repr(derivation) )
                if BibleOrgSysGlobals.debugFlag:
                    assert( derivation and '\t' not in derivation and '\n' not in derivation )
                entryString +=  derivation
            elif element.tag == "strongs_def":
                location = "strongs_def in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "jd28" )
                definition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "definition", repr(definition) )
                if BibleOrgSysGlobals.debugFlag:
                    assert( definition and '\t' not in definition and '\n' not in definition )
                entryString += definition
            elif element.tag == "kjv_def":
                location = "kjv_def in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                #BibleOrgSysGlobals.checkXMLNoTail( element, location, "8s2s" )
                #BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "dvb2" )
                KJVdefinition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "KJVdefinition", repr(KJVdefinition), repr(entryString) )
                if BibleOrgSysGlobals.debugFlag: assert( KJVdefinition and '\t' not in KJVdefinition and '\n' not in KJVdefinition )
                entryString += KJVdefinition
            elif element.tag == "strongsref":
                location = "strongsref in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "kls2" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "ks24" )
                strongsRef = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                if BibleOrgSysGlobals.debugFlag:
                    assert( strongsRef and '\t' not in strongsRef and '\n' not in strongsRef )
                strongsRef = re.sub( '<language="GREEK" strongs="(\d{1,5})">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
                strongsRef = re.sub( '<strongs="(\d{1,5})" language="GREEK">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
                #strongsRef = re.sub( '<language="HEBREW" strongs="(\d{1,5})">', r'<StrongsRef>H\1</StrongsRef>', strongsRef )
                #strongsRef = re.sub( '<strongs="(\d{1,5})" language="HEBREW">', r'<StrongsRef>H\1</StrongsRef>', strongsRef )
                #print( strongs5, "strongsRef", repr(strongsRef) )
                entryString += ' ' + strongsRef
            elif element.tag == "see":
                location = "see in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
                # Process the attributes
                seeLanguage = seeStrongsNumber = None
                for attrib,value in element.items():
                    if attrib == "language": seeLanguage = value
                    elif attrib == "strongs": seeStrongsNumber = value # Note: No leading zeroes here
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if BibleOrgSysGlobals.debugFlag:
                    assert( seeLanguage and seeStrongsNumber and seeStrongsNumber.isdigit() )
                    assert( seeLanguage in ('GREEK','HEBREW',) )
                if 'see' not in entryResults: entryResults['see'] = []
                entryResults['see'].append( ('G' if seeLanguage=='GREEK' else 'H') + seeStrongsNumber )
            else: logging.error( "2d4f Unprocessed {!r} element ({}) in entry".format( element.tag, element.text ) )

        if entryString:
            #print( strongs5, "entryString", repr(entryString) )
            if BibleOrgSysGlobals.debugFlag:
                assert( '\t' not in entryString and '\n' not in entryString )
            entryString = re.sub( '<strongsref language="GREEK" strongs="(\d{1,5})"></strongsref>',
                                r'<StrongsRef>G\1</StrongsRef>', entryString )
            entryString = re.sub( '<strongsref strongs="(\d{1,5})" language="GREEK"></strongsref>',
                                r'<StrongsRef>G\1</StrongsRef>', entryString )
            entryString = re.sub( '<strongsref language="HEBREW" strongs="(\d{1,5})"></strongsref>',
                                r'<StrongsRef>H\1</StrongsRef>', entryString )
            entryString = re.sub( '<strongsref strongs="(\d{1,5})" language="HEBREW"></strongsref>',
                                r'<StrongsRef>H\1</StrongsRef>', entryString )
            if BibleOrgSysGlobals.debugFlag:
                assert( 'strongsref' not in entryString )
            entryResults['Entry'] = entryString
        #print( "entryResults", entryResults )
        self.StrongsEntries[strongs] = entryResults
コード例 #31
0
    def importDataToPython(self):
        """
        Loads (and pivots) the data (not including the header) into suitable Python containers to use in a Python program.
        (Of course, you can just use the elementTree in self._XMLtree if you prefer.)

        The work is done in three passes:
            1. Walk self._XMLtree and collect the raw (string) source references
                together with their BibleReferenceLink targets.
            2. Parse every reference with FlexibleVersesKey. A target reference for
                which FlexibleVersesKey raises TypeError is kept with a parsed value of None.
                (Exceptions raised while parsing a source reference are not caught here.)
            3. Build a dictionary with an entry for every verse included in a source
                reference, then add reverse entries for every verse included in a
                successfully parsed target reference.

        Returns a 2-tuple ( dataList, dataDict ):
            dataList is a list of
                ( sourceReference, sourceComponent, parsedSourceReference, actualLinksList )
                where each actualLinksList entry is
                ( targetReference, targetComponent, parsedTargetReference, linkType ).
            dataDict maps a verse key to a list of 4-tuples of the same shape.
        Both are stored on the object, and a later call returns the stored values
            straight away if the stored list is non-empty.

        Raises TypeError if a reference whose component is not 'Verse' is
            nevertheless accepted by SimpleVerseKey.
        """

        def checkSimpleVerseKey(component, reference):
            """
            Sanity-check a reference string against its declared component type.

            A 'Verse' reference is simply handed to SimpleVerseKey (any exception propagates).
            Any other component is expected to make SimpleVerseKey raise TypeError;
                if it is accepted instead, an error is logged and TypeError is raised.
            """
            if component == 'Verse':
                SimpleVerseKey(reference)
                return
            try:
                SimpleVerseKey(reference, ignoreParseErrors=True)
            except TypeError:
                return  # This should happen coz it should fail the SVK
            logging.error("{} {!r} failed!".format(component, reference))
            raise TypeError

        # end of checkSimpleVerseKey

        assert (self._XMLtree)
        if self.__DataList:  # We've already done an import/restructuring -- no need to repeat it
            return self.__DataList, self.__DataDict

        # Pass 1: collect the raw reference strings from the XML tree
        rawRefLinkList = []
        actualLinkCount = 0
        for element in self._XMLtree:
            # Get these first for helpful error messages
            sourceReference = element.find('sourceReference').text
            sourceComponent = element.find('sourceComponent').text
            assert (sourceComponent in ('Section', 'Verses', 'Verse',))

            BibleOrgSysGlobals.checkXMLNoText(element, sourceReference, 'kls1')
            BibleOrgSysGlobals.checkXMLNoAttributes(element, sourceReference, 'kd21')
            BibleOrgSysGlobals.checkXMLNoTail(element, sourceReference, 'so20')

            actualRawLinksList = []
            for subelement in element:
                if subelement.tag in ('sourceReference', 'sourceComponent',):  # already processed these
                    BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sourceReference, 'ls12')
                    BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sourceReference, 'ks02')
                    BibleOrgSysGlobals.checkXMLNoTail(subelement, sourceReference, 'sqw1')

                elif subelement.tag == 'BibleReferenceLink':
                    BibleOrgSysGlobals.checkXMLNoText(subelement, sourceReference, 'haw9')
                    BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sourceReference, 'hs19')
                    BibleOrgSysGlobals.checkXMLNoTail(subelement, sourceReference, 'jsd9')

                    targetReference = subelement.find('targetReference').text
                    targetComponent = subelement.find('targetComponent').text
                    assert (targetComponent in ('Section', 'Verses', 'Verse',))
                    linkType = subelement.find('linkType').text
                    assert (linkType in (
                        'TSK',
                        'QuotedOTReference',
                        'AlludedOTReference',
                        'PossibleOTReference',
                    ))

                    actualRawLinksList.append((targetReference, targetComponent, linkType,))
                    actualLinkCount += 1
                # NOTE: subelements with any other tag are silently ignored

            rawRefLinkList.append((sourceReference, sourceComponent, actualRawLinksList,))

        if BibleOrgSysGlobals.verbosityLevel > 1:
            print("  {} raw links loaded (with {} actual raw link entries)".
                  format(len(rawRefLinkList), actualLinkCount))

        # Pass 2: parse the reference strings
        myRefLinkList = []
        actualLinkCount = 0
        # NOTE(review): BOS is not used below -- kept in case constructing it has
        #   side effects; TODO confirm and remove
        BOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        for j, (sourceReference, sourceComponent,
                actualRawLinksList) in enumerate(rawRefLinkList):
            checkSimpleVerseKey(sourceComponent, sourceReference)  # Just do some testing first
            # Now do the actual parsing
            parsedSourceReference = FlexibleVersesKey(sourceReference)
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                print(j, sourceComponent, sourceReference, parsedSourceReference)

            actualLinksList = []
            for targetReference, targetComponent, linkType in actualRawLinksList:
                checkSimpleVerseKey(targetComponent, targetReference)  # Just do some testing first
                # Now do the actual parsing
                try:
                    parsedTargetReference = FlexibleVersesKey(targetReference)
                except TypeError:
                    print(
                        "  Temporarily ignored {!r} (TypeError from FlexibleVersesKey)"
                        .format(targetReference))
                    parsedTargetReference = None
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                    print(' ', targetComponent, targetReference, parsedTargetReference)

                actualLinksList.append((
                    targetReference,
                    targetComponent,
                    parsedTargetReference,
                    linkType,
                ))
                actualLinkCount += 1

            myRefLinkList.append((
                sourceReference,
                sourceComponent,
                parsedSourceReference,
                actualLinksList,
            ))

        if BibleOrgSysGlobals.verbosityLevel > 1:
            print("  {} links processed (with {} actual link entries)".format(
                len(rawRefLinkList), actualLinkCount))
        self.__DataList = myRefLinkList

        # Pass 3: now put it into my dictionaries for easy access
        # This part should be customized or added to for however you need to process the data

        # Create a link dictionary (by verse key)
        myRefLinkDict = {}
        for sourceReference, sourceComponent, parsedSourceReference, actualLinksList in myRefLinkList:
            for verseRef in parsedSourceReference.getIncludedVerses():
                assert (isinstance(verseRef, SimpleVerseKey))
                myRefLinkDict.setdefault(verseRef, []).append((
                    sourceReference,
                    sourceComponent,
                    parsedSourceReference,
                    actualLinksList,
                ))
        originalLinks = len(myRefLinkDict)
        print(
            "  {} verse links added to dictionary (includes filling out spans)"
            .format(originalLinks))

        # Add the reversed links to the same dictionary (by verse key)
        reverseLinkTypes = {
            'TSK': 'TSKQuoted',
            'QuotedOTReference': 'OTReferenceQuoted',
            'AlludedOTReference': 'OTReferenceAlluded',
            'PossibleOTReference': 'OTReferencePossible',
        }
        for sourceReference, sourceComponent, parsedSourceReference, actualLinksList in myRefLinkList:
            for targetReference, targetComponent, parsedTargetReference, linkType in actualLinksList:
                if parsedTargetReference is None:
                    continue  # This one couldn't be parsed above
                reverseLinkType = reverseLinkTypes.get(linkType)
                for verseRef in parsedTargetReference.getIncludedVerses():
                    assert (isinstance(verseRef, SimpleVerseKey))
                    if reverseLinkType is None:
                        # NOTE(review): `halt` is not defined in the code visible here --
                        #   presumably a deliberate crash (TODO confirm)
                        halt  # Have a new linkType!
                    # A fresh tuple and list are built for each verse so that
                    #   the dictionary entries don't share a list object
                    myRefLinkDict.setdefault(verseRef, []).append(
                        (targetReference, targetComponent,
                         parsedTargetReference, [
                             (sourceReference, sourceComponent,
                              parsedSourceReference, reverseLinkType)
                         ]))
        totalLinks = len(myRefLinkDict)
        reverseLinks = totalLinks - originalLinks
        print("  {} reverse links added to dictionary to give {} total".format(
            reverseLinks, totalLinks))

        self.__DataDict = myRefLinkDict

        # Let's find the most number of references for a verse
        mostReferences = totalReferences = 0
        mostVerseRef = None  # Stays None if the dictionary is empty
        for verseRef, entryList in self.__DataDict.items():
            numRefs = len(entryList)
            if numRefs > mostReferences:
                mostReferences, mostVerseRef = numRefs, verseRef
            totalReferences += numRefs
        print("  {} maximum links for any one reference ({})".format(
            mostReferences,
            'none' if mostVerseRef is None else mostVerseRef.getShortText()))
        print("  {} total links for all references".format(totalReferences))

        return self.__DataList, self.__DataDict
コード例 #32
0
    def __validateSystem(self, bookOrderTree, systemName):
        """
        Do a semi-automatic check of the XML file validity.

        Every child of bookOrderTree whose tag equals self.mainElementTag is checked for:
            an "id" attribute that counts up from 1 (one per main element),
            element text that has not been seen before,
            the attributes named in self.compulsoryAttributes, self.optionalAttributes
                and self.uniqueAttributes,
            the subelements named in self.compulsoryElements, self.optionalElements
                and self.uniqueElements.
        A child with any other tag is reported with a warning.

        All problems are reported through the logging module -- nothing is returned.
        systemName is only used in some of the log messages.
        """
        assert bookOrderTree

        # One collection of already-seen values per unique element / attribute name
        uniqueDict = {}
        for elementName in self.uniqueElements:
            uniqueDict["Element_" + elementName] = set()
        for attributeName in self.uniqueAttributes:
            uniqueDict["Attribute_" + attributeName] = set()

        expectedID = 1
        for k, element in enumerate(bookOrderTree):
            if element.tag != self.mainElementTag:
                logging.warning(
                    _("Unexpected element: {} in record {}").format(
                        element.tag, k))
                continue

            BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
            if not self.compulsoryAttributes and not self.optionalAttributes:
                BibleOrgSysGlobals.checkXMLNoAttributes(element, element.tag)
            if not self.compulsoryElements and not self.optionalElements:
                BibleOrgSysGlobals.checkXMLNoSubelements(element, element.tag)

            # Check ascending ID field
            ID = element.get("id")
            try:
                intID = int(ID)
            except (TypeError, ValueError):
                # id is missing (None) or not a number -- report it rather than crash the validator
                logging.error(
                    _("Missing or non-numeric ID {!r} in record {} (expecting {}) for {}"
                      ).format(ID, k, expectedID, systemName))
            else:
                if intID != expectedID:
                    logging.error(
                        _("ID numbers out of sequence in record {} (got {} when expecting {}) for {}"
                          ).format(k, intID, expectedID, systemName))
            expectedID += 1

            # Check that this is unique
            if element.text:
                if element.text in uniqueDict:
                    logging.error(
                        _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {}) for {}"
                          ).format(element.text, element.tag, ID, k,
                                   systemName))
                uniqueDict[element.text] = None

            # Check compulsory attributes on this main element
            for attributeName in self.compulsoryAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is None:
                    logging.error(
                        _("Compulsory {!r} attribute is missing from {} element in record {}"
                          ).format(attributeName, element.tag, k))
                elif not attributeValue:  # present but empty (a missing one is reported only once, above)
                    logging.warning(
                        _("Compulsory {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check optional attributes on this main element
            for attributeName in self.optionalAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None and not attributeValue:
                    logging.warning(
                        _("Optional {!r} attribute is blank on {} element in record {}"
                          ).format(attributeName, element.tag, k))

            # Check for unexpected additional attributes on this main element
            for attributeName in element.keys():
                attributeValue = element.get(attributeName)
                if attributeName not in self.compulsoryAttributes and attributeName not in self.optionalAttributes:
                    logging.warning(
                        _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                          ).format(attributeName, attributeValue,
                                   element.tag, k))

            # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
            for attributeName in self.uniqueAttributes:
                attributeValue = element.get(attributeName)
                if attributeValue is not None:
                    seenValues = uniqueDict["Attribute_" + attributeName]
                    if attributeValue in seenValues:
                        logging.error(
                            _("Found {!r} data repeated in {!r} field on {} element in record {}"
                              ).format(attributeValue, attributeName,
                                       element.tag, k))
                    seenValues.add(attributeValue)

            # Check compulsory elements
            for elementName in self.compulsoryElements:
                foundElement = element.find(elementName)
                if foundElement is None:
                    logging.error(
                        _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))
                elif not foundElement.text:  # Only look at the text if the element exists
                    logging.warning(
                        _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check optional elements
            for elementName in self.optionalElements:
                foundElement = element.find(elementName)
                if foundElement is not None and not foundElement.text:
                    logging.warning(
                        _("Optional {!r} element is blank in record with ID {!r} (record {})"
                          ).format(elementName, ID, k))

            # Check for unexpected additional elements
            for subelement in element:
                if subelement.tag not in self.compulsoryElements and subelement.tag not in self.optionalElements:
                    logging.warning(
                        _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                          ).format(subelement.tag, subelement.text, ID, k))

            # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
            for elementName in self.uniqueElements:
                foundElement = element.find(elementName)
                if foundElement is not None:
                    text = foundElement.text
                    seenTexts = uniqueDict["Element_" + elementName]
                    if text in seenTexts:
                        logging.error(
                            _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                              ).format(text, elementName, ID, k))
                    seenTexts.add(text)
コード例 #33
0
    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML book record
            finding and saving verse elements.

        BBB and chapterNumber are only used for location descriptions and messages.
        thisBook receives the extracted lines (via addLine and appendToLastLine).
        verse is the XML verse element: its "vnumber" attribute gives the verse number,
            and its text and subelements (notes, styled portions and line breaks)
            are converted to backslash-marked text.

        The verse text is saved as a 'v' line ("<verseNumber> <text>").
        A line break saves any text collected so far, then adds an 'm' line
            containing the (stripped) tail of the break element.
        Text collected after the 'v' line has already been saved is appended to
            the most recently added line.

        Nothing is returned.
        """

        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = None
        for attrib,value in verse.items():
            if attrib=="vnumber":
                verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert verseNumber
        location = "{}:{}".format( location, verseNumber ) # Get a better location description

        verseLineSaved = False # Becomes True once the 'v' line has been added

        def saveVerseText( text ):
            """
            Save the given text: as the 'v' line the first time,
                and after that as a continuation of the last line added.
            """
            nonlocal verseLineSaved
            if verseLineSaved:
                # The verse number has already been written (before a line break)
                # NOTE(review): assumes thisBook provides appendToLastLine as well as addLine -- TODO confirm
                thisBook.appendToLastLine( text )
            else:
                thisBook.addLine( 'v', verseNumber + ' ' + text )
                verseLineSaved = True
        # end of saveVerseText

        def getStyledText( styleElement, styleLocation, errorCode, description, SFMs ):
            """
            Convert a style element (plus its stripped tail) to marked-up text.

            SFMs maps the accepted values of the "fs" attribute to the marker to use.
            description is only used in the unprocessed attribute warning.
            """
            BibleOrgSysGlobals.checkXMLNoSubelements( styleElement, styleLocation, errorCode )
            fs = css = idStyle = None # css and id attributes aren't handled (they get reported as unprocessed)
            for attrib,value in styleElement.items():
                if attrib=='fs': fs = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in style {}".format( attrib, value, description ) )
            if BibleOrgSysGlobals.debugFlag: assert fs
            if fs not in SFMs:
                # NOTE(review): `halt` is not defined in the code visible here --
                #   presumably a deliberate crash on an unknown style (TODO confirm)
                print( "fs is", fs, "css is", css, "idStyle is", idStyle ); halt
            SFM = SFMs[fs]
            sText = styleElement.text.strip() if styleElement.text else '' # text is None for an empty element
            if BibleOrgSysGlobals.debugFlag: assert sText
            styledText = SFM+' ' + sText + SFM+'*'
            if styleElement.tail: styledText += styleElement.tail.strip()
            return styledText
        # end of getStyledText

        noteStyleSFMs = { 'italic':'\\it', 'super':'\\bdit', 'emphasis':'\\em' }
        verseStyleSFMs = { 'super':'\\bdit', 'emphasis':'\\em' } # 'italic' is only accepted inside notes

        vText = verse.text.strip() if verse.text else ''
        #if not vText: # This happens if a verse starts immediately with a style or note
            #logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) )

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == HaggaiXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib,value in subelement.items():
                    if attrib=="type": noteType = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in note subelement".format( attrib, value ) )
                if noteType and noteType not in ('variant',):
                    logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
                nTail = subelement.tail
                noteText = subelement.text if subelement.text else '' # Don't let a missing text become the string 'None'
                # Styled portions of the note belong inside the note (before the closing marker)
                for subsubelement in subelement:
                    if subsubelement.tag == HaggaiXMLBible.styleTag:
                        noteText += getStyledText( subsubelement, "style in " + sublocation, 'fyt4', 'subsubelement', noteStyleSFMs )
                    else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, subsubelement.tag, sublocation ) )
                vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, noteText ) if noteType else "\\f + \\ft {}\\f*".format( noteText )
                if nTail: # This is the verse text which follows the note
                    if '\n' in nTail:
                        print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                        nTail = nTail.replace( '\n', ' ' )
                    vText += nTail

            elif subelement.tag == HaggaiXMLBible.styleTag:
                vText += getStyledText( subelement, "style in " + location, 'f5gh', 'subelement', verseStyleSFMs )

            elif subelement.tag == HaggaiXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
                art = None
                for attrib,value in subelement.items():
                    if attrib=="art":
                        art = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in break subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
                if vText: # Save what we have so far before starting the new paragraph
                    saveVerseText( vText )
                    vText = ''
                thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' )
            else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) )

        if vText: # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            saveVerseText( vText )
コード例 #34
0
    def load(self):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath into self.XMLTree, saves the text of the
            revision, title, font and copyright header elements into
            self.suppliedMetadata['VerseView'], and hands each book element
            (along with its 1-based position in the file) to
            __validateAndExtractBook.

        An unexpected root tag or an unexpected child element is logged
            as an error rather than raised.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.XMLTree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(self.XMLTree)  # Fail here if we didn't load anything at all

        if self.suppliedMetadata is None: self.suppliedMetadata = {}
        self.suppliedMetadata['VerseView'] = {}

        # Find the main (bible) container
        if self.XMLTree.tag == VerseViewXMLBible.treeTag:
            location = "VerseView XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.XMLTree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoAttributes(self.XMLTree, location, 'js24')
            BibleOrgSysGlobals.checkXMLNoTail(self.XMLTree, location, '1wk8')

            # The header elements all get the same three structural checks and
            #   differ only in the name used in the location string and in the
            #   metadata key (if any) that their text is saved under:
            #       ( tag, name for location string, metadata key or None )
            headerFields = (
                (VerseViewXMLBible.filenameTag, 'filename', None),  # Checked but not saved
                (VerseViewXMLBible.revisionTag, 'revision', 'Revision'),
                (VerseViewXMLBible.titleTag, 'title', 'Title'),
                (VerseViewXMLBible.fontTag, 'font', 'Font'),
                (VerseViewXMLBible.copyrightTag, 'copyright', 'Copyright'),
                (VerseViewXMLBible.sizefactorTag, 'sizefactor', None),  # Checked but not saved
            )

            # Find the submain (various info and then book) containers
            bookNumber = 0
            for element in self.XMLTree:
                # First matching entry wins (same precedence as an if/elif chain)
                headerField = next(
                    (field for field in headerFields if field[0] == element.tag), None)
                if headerField is not None:
                    fieldName, metadataKey = headerField[1], headerField[2]
                    sublocation = fieldName + " in " + location
                    BibleOrgSysGlobals.checkXMLNoAttributes(element, sublocation, 'jk86')
                    BibleOrgSysGlobals.checkXMLNoSubelements(element, sublocation, 'hjk7')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'bh09')
                    if metadataKey is not None:
                        self.suppliedMetadata['VerseView'][metadataKey] = element.text
                    elif fieldName == 'sizefactor':
                        # Only a size factor of 1 has been seen/handled so far
                        if BibleOrgSysGlobals.debugFlag: assert element.text == '1'
                elif element.tag == VerseViewXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                    bookNumber += 1
                    self.__validateAndExtractBook(element, bookNumber)
                else:
                    logging.error(
                        "xk15 Expected to find {!r} but got {!r}".format(
                            VerseViewXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(
                VerseViewXMLBible.treeTag, self.XMLTree.tag))

        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            # These are all compulsory so they should all exist, but use .get()
            #   so that a file which lacks one of them (or which had the wrong
            #   root element -- already logged above) displays None here
            #   instead of aborting the load with a KeyError.
            verseViewMetadata = self.suppliedMetadata['VerseView']
            for metadataKey in ('Revision', 'Title', 'Font', 'Copyright'):
                print("{} is {!r}".format(
                    metadataKey, verseViewMetadata.get(metadataKey)))

        self.applySuppliedMetadata('VerseView')  # Copy some to self.settingsDict
        self.doPostLoadProcessing()
コード例 #35
0
        def loadParagraph( paragraphXML, paragraphlocation ):
            """
            Load a paragraph from the USX XML.
            In this context, paragraph means heading and intro lines,
                as well as paragraphs of verses.

            Uses (and updates) C,V information from the containing function.
            """
            nonlocal C, V

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else:
                    logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else ''
            if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2
            self.addLine( paragraphStyle, paragraphText )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                #print( "USXXMLBibleBook.load", C, V, element.tag, location )
                if element.tag == 'verse': # milestone (not a container)
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = altNumber = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            V = value
                        elif attrib=='style':
                            verseStyle = value
                        elif attrib=='altnumber':
                            altNumber = value
                        else:
                            logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    #if altNumber: print( repr(verseStyle), repr(altNumber) ); halt
                    altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else ''
                    self.addLine( verseStyle, V + altStuff + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail
                        if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references on a new line
                        if vText:
                            #print( repr(vText) )
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            #print( "  charStyle", charStyle )
                            assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle )
                        else:
                            logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    charLine = "\\{} {} ".format( charStyle, element.text )
                    # Now process the subelements -- chars are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( '{} {}:{} {}'.format( self.BBB, C, V, element.tag ) )
                        if subelement.tag == 'char': # milestone (not a container)
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            subCharStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style': subCharStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            charLine += "\\{} {}".format( subCharStyle, subelement.text )
                            if charClosed: charLine += "\\{}*".format( subCharStyle )
                            #if subelement.tail is not None: print( "  tail1", repr(subelement.tail) )
                            charLine += '' if subelement.tail is None else subelement.tail
                        else:
                            logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                    # A character field must be added to the previous field
                    #if element.tail is not None: print( " tail2", repr(element.tail) )
                    charTail = ''
                    if element.tail:
                        charTail = element.tail
                        if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines
                    charLine += "\\{}*{}".format( charStyle, charTail )
                    #if debuggingThisModule: print( "USX.loadParagraph:", C, V, paragraphStyle, charStyle, repr(charLine) )
                    self.appendToLastLine( charLine )
                elif element.tag == 'note':
                    #print( "NOTE", BibleOrgSysGlobals.elementStr( element ) )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert noteStyle in ('x','f',)
                        elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack
                    assert noteStyle and noteCaller # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    if element.text:
                        noteText = element.text.strip()
                        noteLine += noteText
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        #print( C, V, subelement.tag )
                        if subelement.tag == 'char': # milestone (not a container)
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert value=='false'
                                    charClosed = False
                                else:
                                    logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text )
                            # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                            for sub2element in subelement:
                                sub2location = sub2element.tag + ' ' + sublocation
                                #print( C, V, sub2element.tag )
                                if sub2element.tag == 'char': # milestone (not a container)
                                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location )
                                    # Process the attributes first
                                    char2Style, char2Closed = None, True
                                    for attrib,value in sub2element.items():
                                        if attrib=='style':
                                            char2Style = value
                                        elif attrib=='closed':
                                            assert value=='false'
                                            char2Closed = False
                                        else:
                                            logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                                    assert char2Closed
                                    noteLine += "\\{} {}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                            if subelement.tail:
                                charTail = subelement.tail
                                if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line
                                noteLine += charTail
                        elif subelement.tag == 'unmatched': # Used to denote errors in the source text
                            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation )
                            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            unmmatchedMarker = None
                            for attrib,value in subelement.items():
                                if attrib=='marker':
                                    unmmatchedMarker = value
                                else:
                                    logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) )
                            self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) )
                        if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    #noteLine += "\\{}*".format( charStyle )
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        #if '\n' in element.tail: halt
                        noteTail = element.tail
                        if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines
                        noteLine += noteTail
                    #print( "NoteLine", repr(noteLine) )
                    self.appendToLastLine( noteLine )
                elif element.tag == 'link': # Used to include extra resources
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    linkStyle = linkDisplay = linkTarget = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            linkStyle = value
                            assert linkStyle in ('jmp',)
                        elif attrib=='display':
                            linkDisplay = value # e.g., "click here"
                        elif attrib=='target':
                            linkTarget = value # e.g., some reference
                        else:
                            logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    BibleOrgSysGlobals.checkXMLNoText( element, location )
                    BibleOrgSysGlobals.checkXMLNoTail( element, location )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, location )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, location )
                    self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) )
                    self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) )
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if BibleOrgSysGlobals.debugFlag: halt
コード例 #36
0
    def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
        """
        Validate a single XML verse element and save its text in thisBook.

        The verse number is taken from the "n" attribute (any other
            attribute is logged as unprocessed). If the verse has text,
            it is stripped, any newlines are replaced by spaces, and it is
            added to thisBook as a 'v' line. A verse without text adds nothing.
        """
        showProgress = BibleOrgSysGlobals.debugFlag and debuggingThisModule \
                            and BibleOrgSysGlobals.verbosityLevel > 3
        if showProgress:
            print(_("Validating XML verse…"))

        verseLocation = "verse in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoSubelements(verse, verseLocation, 'sg20')
        BibleOrgSysGlobals.checkXMLNoTail(verse, verseLocation, 'l5ks')

        # Pick the verse number out of the attributes (and complain about any others)
        verseNumber = None
        for attributeName, attributeValue in verse.items():
            if attributeName == "n":
                verseNumber = attributeValue
                continue
            logging.warning(
                "Unprocessed {!r} attribute ({}) in verse element".format(
                    attributeName, attributeValue))
        if BibleOrgSysGlobals.debugFlag:
            assert verseNumber

        # Subelements (notes, styled portions, line breaks) aren't processed
        #   for this format -- only the direct text of the verse element is used.
        verseText = (verse.text or '').strip()
        if not verseText:
            return  # Nothing to save for this verse

        # This is the main text of the verse
        if '\n' in verseText:
            print(
                "VerseViewXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}"
                .format(BBB, chapterNumber, verseNumber, verseText))
            verseText = verseText.replace('\n', ' ')
        thisBook.addLine('v', verseNumber + ' ' + verseText)
コード例 #37
0
    def __validate(self):
        """
        Check/validate the loaded data.
        """
        assert (self._XMLtree)

        uniqueDict = {}
        for elementName in self._uniqueElements:
            uniqueDict["Element_" + elementName] = []
        for attributeName in self._uniqueAttributes:
            uniqueDict["Attribute_" + attributeName] = []

        expectedID = 1
        for j, element in enumerate(self._XMLtree):
            if element.tag == self._mainElementTag:
                BibleOrgSysGlobals.checkXMLNoText(element, element.tag)
                BibleOrgSysGlobals.checkXMLNoTail(element, element.tag)
                if not self._compulsoryAttributes and not self._optionalAttributes:
                    BibleOrgSysGlobals.checkXMLNoAttributes(
                        element, element.tag)
                if not self._compulsoryElements and not self._optionalElements:
                    BibleOrgSysGlobals.checkXMLNoSubelements(
                        element, element.tag)

                # Check compulsory attributes on this main element
                for attributeName in self._compulsoryAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is None:
                        logging.error(
                            _("Compulsory {!r} attribute is missing from {} element in record {}"
                              ).format(attributeName, element.tag, j))
                    if not attributeValue:
                        logging.warning(
                            _("Compulsory {!r} attribute is blank on {} element in record {}"
                              ).format(attributeName, element.tag, j))

                # Check optional attributes on this main element
                for attributeName in self._optionalAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is not None:
                        if not attributeValue:
                            logging.warning(
                                _("Optional {!r} attribute is blank on {} element in record {}"
                                  ).format(attributeName, element.tag, j))

                # Check for unexpected additional attributes on this main element
                for attributeName in element.keys():
                    attributeValue = element.get(attributeName)
                    if attributeName not in self._compulsoryAttributes and attributeName not in self._optionalAttributes:
                        logging.warning(
                            _("Additional {!r} attribute ({!r}) found on {} element in record {}"
                              ).format(attributeName, attributeValue,
                                       element.tag, j))

                # Check the attributes that must contain unique information (in that particular field -- doesn't check across different attributes)
                for attributeName in self._uniqueAttributes:
                    attributeValue = element.get(attributeName)
                    if attributeValue is not None:
                        if attributeValue in uniqueDict["Attribute_" +
                                                        attributeName]:
                            logging.error(
                                _("Found {!r} data repeated in {!r} field on {} element in record {}"
                                  ).format(attributeValue, attributeName,
                                           element.tag, j))
                        uniqueDict["Attribute_" +
                                   attributeName].append(attributeValue)

                # Get the sourceComponent to use as a record ID
                ID = element.find("sourceComponent").text

                # Check compulsory elements
                for elementName in self._compulsoryElements:
                    foundElement = element.find(elementName)
                    if foundElement is None:
                        logging.error(
                            _("Compulsory {!r} element is missing in record with ID {!r} (record {})"
                              ).format(elementName, ID, j))
                    else:
                        BibleOrgSysGlobals.checkXMLNoTail(
                            foundElement,
                            foundElement.tag + " in " + element.tag)
                        BibleOrgSysGlobals.checkXMLNoAttributes(
                            foundElement,
                            foundElement.tag + " in " + element.tag)
                        #BibleOrgSysGlobals.checkXMLNoSubelements( foundElement, foundElement.tag + " in " + element.tag )
                        if not foundElement.text:
                            logging.warning(
                                _("Compulsory {!r} element is blank in record with ID {!r} (record {})"
                                  ).format(elementName, ID, j))

                # Check optional elements
                for elementName in self._optionalElements:
                    foundElement = element.find(elementName)
                    if foundElement is not None:
                        BibleOrgSysGlobals.checkXMLNoTail(
                            foundElement,
                            foundElement.tag + " in " + element.tag)
                        BibleOrgSysGlobals.checkXMLNoAttributes(
                            foundElement,
                            foundElement.tag + " in " + element.tag)
                        BibleOrgSysGlobals.checkXMLNoSubelements(
                            foundElement,
                            foundElement.tag + " in " + element.tag)
                        if not foundElement.text:
                            logging.warning(
                                _("Optional {!r} element is blank in record with ID {!r} (record {})"
                                  ).format(elementName, ID, j))

                # Check for unexpected additional elements
                for subelement in element:
                    if subelement.tag not in self._compulsoryElements and subelement.tag not in self._optionalElements:
                        logging.warning(
                            _("Additional {!r} element ({!r}) found in record with ID {!r} (record {})"
                              ).format(subelement.tag, subelement.text, ID, j))

                # Check the elements that must contain unique information (in that particular element -- doesn't check across different elements)
                for elementName in self._uniqueElements:
                    if element.find(elementName) is not None:
                        text = element.find(elementName).text
                        if text in uniqueDict["Element_" + elementName]:
                            logging.error(
                                _("Found {!r} data repeated in {!r} element in record with ID {!r} (record {})"
                                  ).format(text, elementName, ID, j))
                        uniqueDict["Element_" + elementName].append(text)
            else:
                logging.warning(
                    _("Unexpected element: {} in record {}").format(
                        element.tag, j))
            if element.tail is not None and element.tail.strip():
                logging.error(
                    _("Unexpected {!r} tail data after {} element in record {}"
                      ).format(element.tail, element.tag, j))
        if self._XMLtree.tail is not None and self._XMLtree.tail.strip():
            logging.error(
                _("Unexpected {!r} tail data after {} element").format(
                    self._XMLtree.tail, self._XMLtree.tag))
コード例 #38
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
    def load( self ):
        """
        Load the USFX XML data file -- we should already know the filepath.

        Parses self.sourceFilepath into self.tree. If the root element is
        'usfx', the schema location attribute is saved in self.schemaLocation,
        a 'languageCode' child is saved in self.languageCode, and each 'book'
        child is handed to self.loadBook(). Other attributes/elements are
        logged as unprocessed.

        If the file cannot be parsed, a critical message is logged and the
        method returns immediately (self.doPostLoadProcessing() is NOT called
        in that case). Otherwise self.doPostLoadProcessing() is called last.
        Nothing is returned.
        """
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) )

        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( "USFXXMLBible.load: failed loading the xml file {}: {!r}.".format( self.sourceFilepath, err ) )
            return
        if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (usfx) container
        if self.tree.tag == 'usfx':
            location = "USFX file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )
            # Process the attributes first
            self.schemaLocation = None
            for attrib,value in self.tree.items():
                if attrib.endswith("SchemaLocation"):
                    self.schemaLocation = value
                else:
                    logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) )
            # These are never updated in this loop, so the 'dbw1' message below always reports None for them
            BBB = C = V = None
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'languageCode':
                    self.languageCode = element.text
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'cff3' )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'des1' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'dwf2' )
                elif element.tag == 'book':
                    self.loadBook( element ) # Attributes and subelements of the book are handled there
                else:
                    logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) )
        else: # Don't silently ignore a file with an unexpected root element
            logging.error( "Expected to load {!r} but got {!r}".format( 'usfx', self.tree.tag ) )

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print( "USFXXMLBible.load: Didn't find any regularly named USFX files in {!r}".format( self.sourceFolder ) )
            # NOTE(review): foundFiles is not assigned anywhere in this method -- unless it is defined
            #   at module level, reaching this loop raises NameError. TODO confirm where it should come from.
            for thisFilename in foundFiles:
                # Look for BBB in the ID line (which should be the first line in a USFX file)
                isUSFX = False
                thisPath = os.path.join( self.sourceFolder, thisFilename )
                with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done
                    for line in possibleUSXFile:
                        if line.startswith( '\\id ' ):
                            USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USFX ID {!r}".format( USXId ) )
                            BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                            if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )
                            isUSFX = True
                        break # We only look at the first line
                if isUSFX:
                    UBB = USFXXMLBibleBook( self, BBB )
                    UBB.load( self.sourceFolder, thisFilename, self.encoding )
                    UBB.validateMarkers()
                    print( UBB )
                    self.books[BBB] = UBB
                    # Make up our book name dictionaries while we're at it
                    assumedBookNames = UBB.getAssumedBookNames()
                    for assumedBookName in assumedBookNames:
                        self.BBBToNameDict[BBB] = assumedBookName
                        assumedBookNameLower = assumedBookName.lower()
                        self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                        self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                        if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
            if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
コード例 #39
0
    def validateEntry( self, entry ):
        """
        Check/validate the given Strongs Greek lexicon entry.

        Walks the subelements of the given 'entry' XML element, checks each
        one against the expected structure (logging anything unexpected), and
        collects the results in a dict which may contain these keys:
            'word': (greek, translit, beta) tuple from the first 'greek' element
            'pronunciation': 'strongs' attribute of the leading 'pronunciation' element
            'see': list of references such as 'G123' or 'H456' from 'see' elements
            'Entry': the remaining text/markup of the entry as one string
        The dict is stored in self.StrongsEntries, keyed by the text of the
        'strongs' subelement. If that subelement is missing or empty, an error
        is logged and nothing is stored. Nothing is returned.
        """
        if BibleOrgSysGlobals.debugFlag: assert entry.tag == "entry"
        BibleOrgSysGlobals.checkXMLNoText( entry, entry.tag, "na19" )
        BibleOrgSysGlobals.checkXMLNoTail( entry, entry.tag, "kaq9" )

        # Process the entry attributes first
        strongs5 = None # The five-digit (zero-padded) number from the entry's 'strongs' attribute
        for attrib,value in entry.items():
            if attrib ==  "strongs":
                strongs5 = value
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Validating {} entry…".format( strongs5 ) )
            else: logging.warning( "Unprocessed {!r} attribute ({}) in main entry element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert len(strongs5)==5 and strongs5.isdigit()

        entryResults = {}
        entryString = ""
        strongs = None # Text of the 'strongs' subelement -- used as the dictionary key at the end
        gettingEssentials = True # True until a second 'greek' element is encountered
        for j, element in enumerate( entry ):
            #print( strongs5, j, element.tag, repr(entryString) )
            if element.tag == "strongs":
                if BibleOrgSysGlobals.debugFlag: assert gettingEssentials and j==0 and element.text
                BibleOrgSysGlobals.checkXMLNoAttributes( element, element.tag, "md3d" )
                # Entry 02717 and entries 03203..03302 are exempted from the no-tail check.
                #   (The previous test "3203 > n > 3302" could never be true, so the check never ran at all.)
                # NOTE(review): presumably those are the unused Strongs numbers whose entries carry
                #   explanatory tail text -- confirm against the data file.
                if strongs5!='02717' and not (3203 <= int(strongs5) <= 3302):
                    BibleOrgSysGlobals.checkXMLNoTail( element, element.tag, "f3g7" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, element.tag, "m56g" )
                strongs = element.text
                if BibleOrgSysGlobals.debugFlag: assert strongs5.endswith( strongs )
                if element.tail and element.tail.strip(): entryString += element.tail.strip()
            elif element.tag == "greek":
                location = "greek in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "jke0" )
                #BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "df35" )
                # Process the attributes
                translit = greek = beta = None
                for attrib,value in element.items():
                    if attrib=="translit": translit = value
                    elif attrib=="unicode": greek = value
                    elif attrib=="BETA": beta = value
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if BibleOrgSysGlobals.debugFlag: assert greek and translit and beta
                if 'word' not in entryResults: # This is the first/main entry
                    if BibleOrgSysGlobals.debugFlag: assert gettingEssentials and j==1
                    BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                    entryResults['word'] = (greek, translit, beta)
                else: # A further greek element -- it becomes part of the entry text
                    if BibleOrgSysGlobals.debugFlag: assert j > 2
                    gettingEssentials = False
                    entryString += ' ' + BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ) #.replace( '\n', '' )
            elif element.tag == "pronunciation":
                location = "pronunciation in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
                # Process the attributes
                pronunciation = None
                for attrib,value in element.items():
                    if attrib=="strongs": pronunciation = value
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if gettingEssentials:
                    #BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
                    if BibleOrgSysGlobals.debugFlag:
                        assert j == 2
                        assert pronunciation
                        assert 'pronunciation' not in entryResults
                    entryResults['pronunciation'] = pronunciation
                else: # Only the tail text of a later pronunciation element is kept
                    if BibleOrgSysGlobals.debugFlag: assert j>2 and not gettingEssentials
                    if element.tail and element.tail.strip(): entryString += element.tail.strip().replace( '\n', '' )
            elif element.tag == "strongs_derivation":
                location = "strongs_derivation in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "ks24" )
                derivation = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "derivation", repr(derivation) )
                if BibleOrgSysGlobals.debugFlag:
                    assert derivation and '\t' not in derivation and '\n' not in derivation
                entryString +=  derivation
            elif element.tag == "strongs_def":
                location = "strongs_def in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "jd28" )
                definition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "definition", repr(definition) )
                if BibleOrgSysGlobals.debugFlag:
                    assert definition and '\t' not in definition and '\n' not in definition
                entryString += definition
            elif element.tag == "kjv_def":
                location = "kjv_def in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, "jke0" )
                #BibleOrgSysGlobals.checkXMLNoTail( element, location, "8s2s" )
                #BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "dvb2" )
                KJVdefinition = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                #print( strongs5, "KJVdefinition", repr(KJVdefinition), repr(entryString) )
                if BibleOrgSysGlobals.debugFlag: assert KJVdefinition and '\t' not in KJVdefinition and '\n' not in KJVdefinition
                entryString += KJVdefinition
            elif element.tag == "strongsref":
                location = "strongsref in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "kls2" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "ks24" )
                strongsRef = BibleOrgSysGlobals.getFlattenedXML( element, strongs5 ).replace( '\n', '' )
                if BibleOrgSysGlobals.debugFlag:
                    assert strongsRef and '\t' not in strongsRef and '\n' not in strongsRef
                # Raw strings are used for the patterns so that \d is not treated as a (invalid) string escape
                strongsRef = re.sub( r'<language="GREEK" strongs="(\d{1,5})">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
                strongsRef = re.sub( r'<strongs="(\d{1,5})" language="GREEK">', r'<StrongsRef>G\1</StrongsRef>', strongsRef )
                #strongsRef = re.sub( r'<language="HEBREW" strongs="(\d{1,5})">', r'<StrongsRef>H\1</StrongsRef>', strongsRef )
                #strongsRef = re.sub( r'<strongs="(\d{1,5})" language="HEBREW">', r'<StrongsRef>H\1</StrongsRef>', strongsRef )
                #print( strongs5, "strongsRef", repr(strongsRef) )
                entryString += ' ' + strongsRef
            elif element.tag == "see":
                location = "see in Strongs " + strongs5
                BibleOrgSysGlobals.checkXMLNoText( element, location, "iw9k" )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, "kd02" )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, "0s20" )
                # Process the attributes
                seeLanguage = seeStrongsNumber = None
                for attrib,value in element.items():
                    if attrib == "language": seeLanguage = value
                    elif attrib == "strongs": seeStrongsNumber = value # Note: No leading zeroes here
                    else: logging.warning( "scs4 Unprocessed {!r} attribute ({}) in {}".format( attrib, value, location ) )
                if BibleOrgSysGlobals.debugFlag:
                    assert seeLanguage and seeStrongsNumber and seeStrongsNumber.isdigit()
                    assert seeLanguage in ('GREEK','HEBREW',)
                if 'see' not in entryResults: entryResults['see'] = []
                entryResults['see'].append( ('G' if seeLanguage=='GREEK' else 'H') + seeStrongsNumber )
            else: logging.error( "2d4f Unprocessed {!r} element ({}) in entry".format( element.tag, element.text ) )

        if entryString:
            #print( strongs5, "entryString", repr(entryString) )
            if BibleOrgSysGlobals.debugFlag:
                assert '\t' not in entryString and '\n' not in entryString
            # Convert any empty strongsref elements (either attribute order) to the StrongsRef form
            entryString = re.sub( r'<strongsref language="GREEK" strongs="(\d{1,5})"></strongsref>',
                                r'<StrongsRef>G\1</StrongsRef>', entryString )
            entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="GREEK"></strongsref>',
                                r'<StrongsRef>G\1</StrongsRef>', entryString )
            entryString = re.sub( r'<strongsref language="HEBREW" strongs="(\d{1,5})"></strongsref>',
                                r'<StrongsRef>H\1</StrongsRef>', entryString )
            entryString = re.sub( r'<strongsref strongs="(\d{1,5})" language="HEBREW"></strongsref>',
                                r'<StrongsRef>H\1</StrongsRef>', entryString )
            if BibleOrgSysGlobals.debugFlag:
                assert 'strongsref' not in entryString
            entryResults['Entry'] = entryString
        #print( "entryResults", entryResults )
        if strongs is None: # No usable 'strongs' subelement, so there is no key to store the results under
            logging.error( "Missing or empty strongs element in entry {}".format( strongs5 ) )
            return
        self.StrongsEntries[strongs] = entryResults
コード例 #40
0
ファイル: USFXXMLBible.py プロジェクト: alerque/BibleOrgSys
    def loadBook( self, bookElement ):
        """
        Load the book container from the XML data file.

        Creates a new BibleBook (stored in self.thisBook) for the USFM book
        code found in the 'id' attribute of bookElement, converts each child
        element of bookElement into lines added to that book (delegating
        paragraphs, tables, footnotes, etc. to the other load... methods), and
        finally passes the book to self.saveBook().

        Parameters:
            bookElement: the XML 'book' element (its tag is asserted to be 'book').

        Nothing is returned. Unexpected attributes/elements are logged.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) )
        assert( bookElement.tag == 'book' )
        mainLocation = self.name + " USFX book"

        # Process the attributes first
        bookCode = None
        for attrib,value in bookElement.items():
            if attrib == 'id':
                bookCode = value
            else:
                logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) )
        BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( bookCode )
        mainLocation = "{} USFX {} book".format( self.name, BBB )
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) )
        # NOTE(review): these two checks look at the whole tree (self.tree) rather than at bookElement,
        #   and reuse the message codes from load() -- confirm whether bookElement was intended.
        BibleOrgSysGlobals.checkXMLNoText( self.tree, mainLocation, '4f6h' )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, mainLocation, '1wk8' )

        # Now create our actual book
        self.thisBook = BibleBook( self, BBB )
        self.thisBook.objectNameString = "USFX XML Bible Book object"
        self.thisBook.objectTypeString = "USFX"

        C = V = '0'
        for element in bookElement:
            # The format string needs a placeholder for each of BBB, C and V
            #   (previously V was silently dropped and the text read "BBB:C")
            location = "{} of {} {} {}:{}".format( element.tag, mainLocation, BBB, C, V )
            if element.tag == 'id':
                idText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'vsg3' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ksq2' )
                for attrib,value in element.items():
                    if attrib == 'id':
                        assert( value == bookCode )
                    else:
                        logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'id', bookCode + ((' '+idText) if idText else '') )
            elif element.tag == 'ide':
                ideText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'jsa0' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ls01' )
                charset = None
                for attrib,value in element.items():
                    if attrib == 'charset': charset = value
                    else:
                        logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # Cope with a missing charset attribute rather than failing on None + str
                self.thisBook.addLine( 'ide', (charset if charset else '') + ((' '+ideText) if ideText else '') )
            elif element.tag == 'h':
                hText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'dj35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'hs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'hs32' )
                self.thisBook.addLine( 'h', clean(hText) )
            elif element.tag == 'toc':
                tocText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ss13' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js13' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems compulsory
                        level = value
                    else:
                        logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # Cope with a missing level attribute (same approach as for the 's' marker below)
                self.thisBook.addLine( 'toc' + (level if level else ''), clean(tocText) )
            elif element.tag == 'c':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone
                for attrib,value in element.items():
                    if attrib == 'id':
                        C, V = value, '0' # New chapter, so the verse number restarts
                    else:
                        logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'c', C )
            elif element.tag == 's':
                sText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'wxg0' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems optional
                        level = value
                    else:
                        logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                marker = 's'
                if level: marker += level
                self.thisBook.addLine( marker, sText )
                for subelement in element:
                    sublocation = subelement.tag + " of " + location
                    if subelement.tag == 'f':
                        self.loadFootnote( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'x':
                        self.loadCrossreference( subelement, sublocation )
                    elif subelement.tag == 'fig':
                        self.loadFigure( subelement, sublocation )
                    elif subelement.tag == 'table':
                        self.loadTable( subelement, sublocation )
                    elif subelement.tag in ('add','it','bd','bdit','sc',):
                        self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'optionalLineBreak':
                        print( "What is loadBook optionalLineBreak?" )
                    else:
                        logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) )
            elif element.tag in ('p','q','d',):
                V = self.loadParagraph( element, location, BBB, C ) # Returns the updated verse number
            elif element.tag == 'b':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'nd04' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
                self.thisBook.addLine( 'b', '' )
            elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'od01' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'gd92' )
                idField = None
                for attrib,value in element.items():
                    if attrib == 'id': idField = value
                    else:
                        logging.warning( _("dv35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if idField and text is None:
                    text = idField # Use the id attribute as the text when the element has none
                elif idField is not None: # Only complain if there actually was an id attribute that we didn't use
                    logging.warning( _("dve4 Unprocessed idField ({}) in {}").format( idField, location ) )
                if text is None:
                    logging.critical( "Why is {} empty at {}".format( marker, location ) )
                assert( text is not None )
                self.thisBook.addLine( marker, text )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 've': # What's this in Psalms: <c id="4" /><ve /><d>For the Chief Musician; on stringed instruments. A Psalm of David.</d>
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'kds3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ks29' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'kj24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js91' )
                #self.thisBook.addLine( 'b', '' )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring 've' field", BBB, C, V )
            else:
                logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) )
                #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                # NOTE(review): halt is not defined in this method -- presumably a deliberate way
                #   of stopping (via NameError) when debugging; confirm.
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
        self.saveBook( self.thisBook )