def __init__(self, htmlWriter, context):
    """Set up the handler state for converting one OSIS document.

    htmlWriter -- object the generated html is written to
    context    -- conversion context; its config.epub3 setting is passed
                  on to the footnote collector
    """
    # Collaborators
    self._htmlWriter = htmlWriter
    self._context = context

    # OSIS header tracking
    self._osisFound = False             # The <osis> tag has been found
    self._osisTextFound = False         # The <osisText> tag has been found
    self._osisIDWork = None             # Value of osisIDwork attribute in <osisText> tag
    self._inHeader = False              # Currently processing OSIS header
    self._headerProcessed = False       # OSIS header has been fully processed
    self._inWork = False                # Currently within a <work> tag
    self._workId = ''                   # Value of osisWork attribute in <work> tag

    # Titles
    self._defaultHeaderLevel = 1
    self._inTitle = False               # Currently processing a title
    self._ignoreTitle = False           # Title currently being processed should be ignored
    self._titleTag = ''                 # Opening tag html for title currently being processed
    self._titleText = ''                # Text of title currently being processed

    # Paragraphs, line groups and tables
    self._inParagraph = False           # Currently within an html paragraph which corresponds to an OSIS paragraph
    self._inGeneratedPara = False       # Currently within an html paragraph which does not correspond to an OSIS paragraph
    self._lineGroupPara = False         # A <div> tag has been written for the current line group
    self._inTable = False               # Currently processing contents of a table

    # Line breaks
    self._breakCount = 0                # Used to prevent too many successive line breaks
    self._suppressBreaks = False        # Set to prevent <br /> tags being written

    # Inline markup
    self._hiHtmlTag = ['', '', '']      # Records tags used for up to 3 levels of nested <hi> tags
    self._hiLevel = 0                   # Number of currently nested <hi> tags
    self._inGlossaryRef = False         # Currently processing a reference to a glossary entry
    self._ignoreText = False            # Text encountered should currently be ignored

    # Figures
    self._figHtml = ''                  # Html generated for a figure
    self._inCaption = False             # Currently processing a figure caption

    # Footnotes (the collector is created last, once the context is stored)
    self._inFootnote = False            # Currently processing a footnote
    self._writingFootnoteMarker = False # Currently writing marker for a footnote
    self._footnotes = BookFootnotes(htmlWriter, self._context.config.epub3)
class OsisHandler(handler.ContentHandler):
    """SAX content handler which converts an OSIS document to html.

    Tags encountered before the end of the OSIS <header> are only used to
    collect document information (work id, language, glossary works).
    Once the header has been processed, each start tag is passed to
    _processBodyTag() and the resulting html is sent to the html writer,
    the footnote collector, the pending title text or the pending figure
    html, depending on the current state (see _writeHtml()).

    This class provides default handling; several methods are intended to
    be overridden in subclasses.
    """

    # Maps the type attribute of a <hi> tag to (html tag name, extra attributes)
    _HI_TAGS = {
        'bold': ('b', ''),
        'emphasis': ('em', ''),
        'italic': ('i', ''),
        'line-through': ('s', ''),
        'sub': ('sub', ''),
        'super': ('sup', ''),
        'underline': ('u', ''),
        'small-caps': ('span', ' style="font-variant:small-caps;"'),
    }

    def __init__(self, htmlWriter, context):
        """Create a handler.

        htmlWriter -- object with write(html) and close() methods which
                      receives the generated html
        context    -- conversion context (configuration, language, list of
                      image files, list of glossary works)
        """
        handler.ContentHandler.__init__(self)
        self._breakCount = 0                # Used to prevent too many successive line breaks
        self._context = context
        self._defaultHeaderLevel = 1
        self._figHtml = ''                  # Html generated for a figure
        self._headerProcessed = False       # OSIS header has been fully processed
        self._hiHtmlTag = ['', '', '']      # Records tags used for nested <hi> tags (grown on demand)
        self._hiLevel = 0                   # Number of currently nested <hi> tags
        self._htmlWriter = htmlWriter
        self._ignoreTitle = False           # Title currently being processed should be ignored
        self._inCaption = False             # Currently processing a figure caption
        self._ignoreText = False            # Text encountered should currently be ignored
        self._inFootnote = False            # Currently processing a footnote
        self._inGeneratedPara = False       # Currently within an html paragraph which does not correspond to an OSIS paragraph
        self._inGlossaryRef = False         # Currently processing a reference to a glossary entry
        self._inHeader = False              # Currently processing OSIS header
        self._inParagraph = False           # Currently within an html paragraph which corresponds to an OSIS paragraph
        self._inTable = False               # Currently processing contents of a table
        self._inTitle = False               # Currently processing a title
        self._inWork = False                # Currently within a <work> tag
        self._lineGroupPara = False         # A <div> tag has been written for the current line group
        self._osisFound = False             # The <osis> tag has been found
        self._osisIDWork = None             # Value of osisIDwork attribute in <osisText> tag
        self._osisTextFound = False         # The <osisText> tag has been found
        self._suppressBreaks = False        # Set to prevent <br /> tags being written
        self._titleTag = ''                 # Opening tag html for title currently being processed
        self._titleText = ''                # Text of title currently being processed
        self._writingFootnoteMarker = False # Currently writing marker for a footnote
        self._workId = ''                   # Value of osisWork attribute in <work> tag
        self._footnotes = BookFootnotes(htmlWriter, self._context.config.epub3)

    def startDocument(self):
        """Reset the per-document parsing state."""
        self._breakCount = 0
        self._figHtml = ''
        self._headerProcessed = False
        self._hiHtmlTag = ['', '', '']
        self._hiLevel = 0
        self._inCaption = False
        self._ignoreText = False
        self._inFootnote = False
        self._inGeneratedPara = False
        self._inGlossaryRef = False
        self._inHeader = False
        self._inParagraph = False
        self._inTable = False
        self._inTitle = False
        self._inWork = False
        self._lineGroupPara = False
        self._osisFound = False
        self._osisTextFound = False
        self._suppressBreaks = False
        self._titleText = ''

    def endDocument(self):
        """Close the html writer."""
        self._htmlWriter.close()

    def startElement(self, name, attrs):
        """Handle an opening tag.

        After the header has been processed the tag is passed to
        _processBodyTag(). Before that, the <osis>, <osisText> and <header>
        tags are expected in that order, and <work> entries in the header
        are examined.

        Raises OsisError if the first tag is not <osis> or the second tag
        is not <osisText>.
        """
        if self._headerProcessed:
            self._processBodyTag(name, attrs)

        elif not self._osisFound:
            if name in ('osis', 'osis:osis'):
                self._osisFound = True
            else:
                raise OsisError('osis tag not found')

        elif not self._osisTextFound:
            if name == 'osisText':
                self._osisTextFound = True
                self._osisIDWork = self._getAttributeValue(attrs, 'osisIDWork')
                # Only take the language from the document if none has been set
                if self._context.lang == '':
                    lang = self._getAttributeValue(attrs, 'xml:lang')
                    if lang is not None and lang != 'und':
                        self._context.lang = lang
            else:
                raise OsisError('osisText tag not found')

        elif not self._inHeader:
            if name == 'header':
                self._inHeader = True

        elif name == 'work':
            self._workId = self._getAttributeValue(attrs, 'osisWork')
            if self._workId is not None:
                self._inWork = True

        elif self._inWork:
            if self._workId == self._osisIDWork:
                # This is the work which the document itself represents
                if name == 'title':
                    self._inTitle = True
            elif name == 'type':
                # Another work: record it if it is a glossary
                workType = self._getAttributeValue(attrs, 'type')
                if workType == 'x-glossary':
                    self._context.glossaries.append(self._workId)

    def endElement(self, name):
        """Handle a closing tag, writing any closing html required.

        Raises OsisError if </header> is found when not in the header.
        """
        if name == 'caption':
            if self._inCaption:
                self._inCaption = False
                self._figHtml += '</figcaption>\n'

        elif name == 'catchWord':
            self._writeHtml('</i>')

        elif name == 'cell':
            self._writeHtml('</td>')

        elif name == 'figure':
            # _figHtml is empty if the figure was omitted
            if self._figHtml != '':
                self._figHtml += '</figure>\n'
                self._writeHtml(self._figHtml)
                self._figHtml = ''

        elif name == 'foreign':
            self._writeHtml('</span>')

        elif name == 'head':
            self._writeHtml('</div>')

        elif name == 'header':
            if self._inHeader:
                self._inHeader = False
                self._headerProcessed = True
            else:
                raise OsisError('unexpected end of header')

        elif name == 'hi':
            # Opening <hi> tags are not counted until the header has been
            # processed, so only unwind a level which was actually entered
            if self._hiLevel > 0:
                self._hiLevel -= 1
                if self._hiHtmlTag[self._hiLevel] != '':
                    self._writeHtml('</%s>' % self._hiHtmlTag[self._hiLevel])
                    self._hiHtmlTag[self._hiLevel] = ''

        elif name == 'item':
            self._writeHtml('</li>\n')

        elif name == 'l':
            self._writeHtml('</div>\n')
            self._breakCount = 1

        elif name == 'lg':
            if self._lineGroupPara:
                self._writeHtml('</div>\n')
                self._lineGroupPara = False

        elif name == 'list':
            self._writeHtml('</ul>\n')

        elif name == 'note':
            if self._inFootnote:
                self._inFootnote = False
                self._footnotes.footnoteComplete()
            else:
                self._ignoreText = False

        elif name == 'p':
            if self._inParagraph:
                self._writeHtml('</p>\n')
                self._breakCount = 1
                self._inParagraph = False

        elif name == 'rdg':
            self._writeHtml('</span>')

        elif name == 'reference':
            if self._inGlossaryRef:
                if self._context.outputFmt == 'fb2':
                    # For FB2, <span> not effective, so use markers to be picked up by postprocessor
                    self._writeHtml('%%%')
                else:
                    self._writeHtml('</span>')
                self._inGlossaryRef = False
            elif self._figHtml != '':
                self._ignoreText = False

        elif name == 'row':
            self._writeHtml('</tr>\n')

        elif name == 'title':
            if self._inTitle:
                self._inTitle = False
                if self._ignoreTitle:
                    self._ignoreTitle = False
                elif self._headerProcessed:
                    self._writeTitle()

        elif name == 'table':
            self._writeHtml('</table>\n')
            self._inTable = False

        elif name == 'transChange':
            self._writeHtml('</span>')

        elif name == 'work':
            self._inWork = False

    def characters(self, content):
        """Handle text content.

        This is default handling, which will usually be overridden.
        Text is only written once the header has been processed, and
        only if it is not entirely whitespace.
        """
        text = content.strip()
        if self._headerProcessed and len(text) > 0:
            self._checkGeneratePara()
            self._writeHtml(content)

    def _getAttributeValue(self, attrs, attrName):
        """Return the value of the named attribute, or None if not present."""
        for (name, value) in attrs.items():
            if name == attrName:
                return value
        return None

    def _processBodyTag(self, name, attrs):
        """Handle an opening tag found after the OSIS header.

        Tags which are not recognised are reported using the context's
        unexpectedTag() method.
        """
        if name == 'caption':
            if self._figHtml != '':
                self._figHtml += '<figcaption>'
                self._inCaption = True
            else:
                print('Caption not associated with a figure')

        elif name == 'catchWord':
            self._writeHtml('<i>')

        elif name == 'cell':
            self._writeHtml('<td>')

        elif name == 'figure':
            self._startFigure(attrs)

        elif name == 'foreign':
            self._writeHtml('<span class="foreign">')

        elif name == 'head':
            self._writeHtml('<div class="heading">')

        elif name == 'hi':
            self._handleHi(attrs)

        elif name == 'index':
            # <index> tags are ignored
            pass

        elif name == 'item':
            itemType = self._getAttributeValue(attrs, 'type')
            itemSubType = self._getAttributeValue(attrs, 'subType')
            itemClass = ''
            if itemType is not None:
                itemClass = itemType
                if itemSubType is not None:
                    itemClass += ' '
                    itemClass += itemSubType
            elif itemSubType is not None:
                itemClass = itemSubType
            tag = '<li>'
            if itemClass != '':
                tag = '<li class="%s">' % itemClass
            self._writeHtml(tag)

        elif name == 'lb':
            breakType = self._getAttributeValue(attrs, 'type')
            if breakType != 'x-optional' or self._context.config.optionalBreaks:
                self._writeBreak(False)
            else:
                # An optional break which is not wanted is replaced by a space
                self._writeHtml(' ')

        elif name == 'l':
            htmlTag = self._lineHtml(attrs)
            self._writeHtml(htmlTag)

        elif name == 'lg':
            if not self._inParagraph and not self._inGeneratedPara:
                self._writeHtml('<div>')
                self._lineGroupPara = True
            else:
                self._writeBreak(True)

        elif name == 'list':
            listType = self._getAttributeValue(attrs, 'subType')
            if listType is None:
                htmlTag = '<ul>'
            else:
                htmlTag = '<ul class="%s">' % listType
            self._writeHtml(htmlTag)

        elif name == 'milestone':
            # <milestone> tags are ignored
            pass

        elif name == 'name':
            # <name> tags are ignored
            pass

        elif name == 'note':
            noteType = self._getAttributeValue(attrs, 'type')
            notePlace = self._getAttributeValue(attrs, 'placement')
            if noteType == 'study' or notePlace == 'foot':
                # This type of note is a footnote
                self._startFootnote(attrs)
            else:
                # Ignore other types of note (generally cross-references)
                self._ignoreText = True

        elif name == 'p':
            self._endGeneratedPara()
            paraTag = self._generateParaTag(attrs)
            self._inParagraph = True
            self._writeHtml(paraTag)

        elif name == 'rdg':
            self._writeHtml('<span class="alt-var">')

        elif name == 'reference':
            self._processReference(attrs)

        elif name == 'row':
            self._writeHtml('<tr>')

        elif name == 'seg':
            # <seg> tags are normally ignored (may be overridden in subclass)
            pass

        elif name == 'table':
            self._writeHtml('<table>\n')
            self._inTable = True

        elif name == 'title':
            self._startTitle(attrs)

        elif name == 'transChange':
            self._writeHtml('<span class="transChange">')

        else:
            self._context.unexpectedTag(name)

    def _startFigure(self, attrs):
        """Copy the image for a <figure> tag and start the figure html.

        The figure is omitted (with a message) if it has no src attribute,
        if the image file cannot be copied, or if the file name is invalid.
        """
        source = self._getAttributeValue(attrs, 'src')
        if source is None:
            print('Figure omitted: no src attribute')
            return

        # Assume that a TIFF input file has been converted to JPG
        source = source.replace('.tiff', '.jpg')
        source = source.replace('.tif', '.jpg')

        # If the image file spec starts with "images/", remove it,
        # as this is already in _context.config.imgFileDir
        source = re.sub(r'^(\./|/)?images/', '', source)

        # Copy the image file to the current directory
        fullFileSpec = self._context.config.imgFileDir + '/' + source
        try:
            shutil.copy(fullFileSpec, '.')

            # Set up the html
            self._figHtml = '<figure>\n<img src="%s" />\n' % source

            # Add the image file to the list
            if source not in self._context.imageFiles:
                self._context.imageFiles.append(source)

        except IOError:
            print('Figure omitted: image file %s not found' % fullFileSpec)
        except UnicodeEncodeError:
            print('Figure omitted: invalid image file name')

    def _startTitle(self, attrs):
        """Start processing a <title> tag found after the OSIS header.

        Running heads are ignored. For other titles the opening html
        header tag is prepared; it is written along with the title text
        when the title ends.
        """
        titleType = self._getAttributeValue(attrs, 'type')
        if titleType == 'runningHead':
            self._inTitle = True
            self._ignoreTitle = True
            return

        # The level attribute arrives as text but is formatted as a number
        headerLevel = self._defaultHeaderLevel
        level = self._getAttributeValue(attrs, 'level')
        if level is not None:
            try:
                headerLevel = int(level)
            except ValueError:
                # Not a number: keep the default level
                pass

        subType = self._getAttributeValue(attrs, 'subType')
        if subType is not None:
            self._titleTag = '<h%d class="%s">' % (headerLevel, subType)
        else:
            self._titleTag = '<h%d>' % (headerLevel)
        self._inTitle = True
        self._titleText = ''

    def _writeHtml(self, html):
        """Send html to the destination appropriate to the current state.

        Footnote text goes to the footnote collector, title text and
        caption text are accumulated until the title or figure is complete,
        and anything else goes to the html writer.
        """
        self._suppressBreaks = False
        if self._inFootnote and not self._writingFootnoteMarker:
            self._footnotes.addFootnoteText(html)
        elif self._inTitle:
            self._titleText += html
        elif self._inCaption:
            self._figHtml += html
        else:
            self._htmlWriter.write(html)
        self._breakCount = 0            # will be overwritten if called from _writeBreak()

    def _writeBreak(self, newline):
        """Write a <br /> tag, unless suppressed or two have just been written.

        newline -- if true, the tag is followed by a newline character
        """
        if not self._suppressBreaks and self._breakCount < 2:
            # _writeHtml() resets the count, so remember it here
            storedCount = self._breakCount
            self._writeHtml('<br />')
            if newline:
                self._writeHtml('\n')
            self._breakCount = storedCount + 1

    def _writeTitle(self):
        """Write the title which has just been completed.

        Returns True if a title was written, False if it had no text.
        """
        if len(self._titleText) > 0:
            self._writeBreak(False)
            self._writeHtml(self._titleTag)
            self._writeHtml(self._titleText)
            # The third character of the opening tag ('<hN...') is the level
            closingTag = '</h%s><br />\n' % self._titleTag[2]
            self._writeHtml(closingTag)
            self._suppressBreaks = True
            self._breakCount = 2
            return True
        else:
            return False

    def _startGeneratedPara(self):
        """Open an html paragraph which has no corresponding OSIS paragraph."""
        print('Generating para')
        paraTag = '<p class="x-indent-0">'
        self._writeHtml(paraTag)
        self._inGeneratedPara = True

    def _endGeneratedPara(self):
        """Close the generated html paragraph, if there is one open."""
        if self._inGeneratedPara:
            self._writeHtml('</p>')
            self._inGeneratedPara = False

    def _closeParagraph(self):
        """Close the current html paragraph, whether OSIS or generated."""
        if self._inParagraph:
            self._writeHtml('</p>')
            self._inParagraph = False
        else:
            self._endGeneratedPara()

    def _footnoteMarker(self, refBook, noteRef):
        """Return the html for the marker which links to a footnote.

        refBook -- text used as the prefix of the footnote's id
        noteRef -- footnote number
        """
        if self._context.config.epub3:
            refString = '<sup><a epub:type="noteref" href="#%s%d">[%d]</a></sup>' % (refBook, noteRef, noteRef)
        else:
            refString = '<sup><a href="#%s%d" id="Ref%s%d">[%d]</a></sup>' % (refBook, noteRef, refBook, noteRef, noteRef)
        return refString

    def _writeFootnoteMarker(self, refBook, noteRef):
        """Write a footnote marker to the text rather than to the footnote."""
        self._writingFootnoteMarker = True
        refString = self._footnoteMarker(refBook, noteRef)
        self._writeHtml(refString)
        self._writingFootnoteMarker = False

    def _lineHtml(self, attrs):
        """Return the opening <div> tag for a poetic line (<l> tag).

        The class always includes 'poetic-line', followed by the type
        (or an indent class derived from the level if there is no type)
        and the subType, where present.
        """
        lineType = self._getAttributeValue(attrs, 'type')
        lineSubType = self._getAttributeValue(attrs, 'subType')
        lineClass = 'poetic-line'
        if lineType is None:
            level = self._getAttributeValue(attrs, 'level')
            if level is not None:
                lineType = 'x-indent-%s' % level
        if lineType is not None:
            lineClass = '%s %s' % (lineClass, lineType)
        if lineSubType is not None:
            lineClass = '%s %s' % (lineClass, lineSubType)
        htmlTag = '<div class="%s">' % lineClass
        return htmlTag

    def _startFootnote(self, attrs):
        """Start a new footnote and write its marker."""
        # attrs not used here but may be used in overriding function
        footnoteNo = self._footnotes.newFootnote(self._osisIDWork, '')
        self._writeFootnoteMarker(self._osisIDWork, footnoteNo)
        self._inFootnote = True

    def _generateParaTag(self, attrs, overrideSubType = None):
        """Return the opening <p> tag for an OSIS paragraph.

        The class is made up of the type and subType attributes, where
        present, separated by a space.

        overrideSubType -- if not None, used in place of the subType attribute
        """
        pType = self._getAttributeValue(attrs, 'type')
        subType = overrideSubType
        if subType is None:
            subType = self._getAttributeValue(attrs, 'subType')

        # Missing or empty parts contribute nothing to the class
        pClass = ' '.join([part for part in (pType, subType) if part])
        if pClass != '':
            return '<p class="%s">' % pClass
        return '<p>'

    def _handleHi(self, attrs):
        """Handle an opening <hi> tag.

        The html tag written (if any) is recorded for the current nesting
        level so that the matching closing tag can be written later.
        Nothing is written if text is currently being ignored or if the
        type is not supported.
        """
        # Allow for deeper nesting than the slots initially provided
        while len(self._hiHtmlTag) <= self._hiLevel:
            self._hiHtmlTag.append('')

        htmlTag = ''
        attributes = ''
        if not self._ignoreText:
            hiType = self._getAttributeValue(attrs, 'type')
            if hiType in self._HI_TAGS:
                htmlTag, attributes = self._HI_TAGS[hiType]
            else:
                print('Unsupported hi type %s' % hiType)

        self._hiHtmlTag[self._hiLevel] = htmlTag
        if htmlTag != '':
            self._writeHtml('<%s%s>' % (htmlTag, attributes))
        self._hiLevel += 1

    def _processReference(self, attrs):
        """Handle an opening <reference> tag."""
        # reference tags are expected but are ignored
        # apart from glossary references
        refType = self._getAttributeValue(attrs, 'type')
        if refType == "x-glossary" or refType == "x-glosslink":
            html = '<span class="x-glossary-link">'
            if self._context.outputFmt == 'fb2':
                # This will be lost in conversion to FB2,
                # so instead use marker which will be picked up by FB2 post-processor
                html = '%&x-glossary-link&%'
            self._writeHtml(html)
            self._inGlossaryRef = True
        elif self._figHtml != '':
            # Text of other references within a figure is ignored
            self._ignoreText = True

    def _checkGeneratePara(self):
        """Start a generated paragraph if text would otherwise be outside
        any paragraph, title, caption, line group, table or footnote."""
        if not (self._inParagraph or self._inTitle or self._inGeneratedPara
                or self._inCaption or self._lineGroupPara or self._inTable
                or self._inFootnote):
            self._startGeneratedPara()