def latex_to_html(texfilename,
                  embargo_chapters=(),
                  make_url_to_page=lambda x: x,
                  make_url_to_figure=lambda x: x,
                  skip_prologue=False,
                  footnotes_inline=False,
                  toc_placeholder="",
                  condense_simple_sections=False):
    """Convert a LaTeX book into a set of HTML pages and a table of contents.

    The file is parsed with ``TeX(file=texfilename).parse()`` and the document
    tree is walked recursively.  Content that precedes the first heading goes
    into a "prologue" page; each heading then starts a new page, except that
    with ``condense_simple_sections`` a heading is folded into the current
    page while that page has not yet received a non-empty paragraph.

    Args:
        texfilename: Path handed to the TeX parser.
        embargo_chapters: Collection of chapter numbers.  Pages whose first
            counter is in this collection are left out of the result.
        make_url_to_page: Callable mapping a page name to the ``href`` stored
            in the table of contents and used for cross references.
        make_url_to_figure: Callable mapping a figure file name (with
            ``.pdf``/``.png`` removed) to the ``src`` of the ``<img>`` tag.
        skip_prologue: If true, the prologue page is left out of the result.
        footnotes_inline: If true, footnote text is emitted in place inside a
            hidden ``<span>``; otherwise it is collected in the page's
            separate footnotes buffer and linked with anchors.
        toc_placeholder: Raw HTML written where ``\\tableofcontents`` occurs.
        condense_simple_sections: See above.

    Returns:
        A dict with two keys: ``"pages"`` maps each page name to its index in
        ``"toc"``; ``"toc"`` is a list of dicts with the keys ``indent``,
        ``number``, ``name``, ``href``, ``content``, ``footnotes`` and
        ``extraneous_entries``.
    """
    doc = TeX(file=texfilename).parse()
    doc.normalize()

    # One tuple per output page:
    #   (node name, counter values, title, content buffer,
    #    extraneous TOC entries, footnotes buffer)
    bookcontent = []
    context = {"is_in_footnotes": False, "fnid": 0}

    def write(s, escape=True, quote=False):
        """Append ``s`` to the current page's content or footnotes buffer.

        With ``quote`` set, both quote characters are escaped as well so the
        text can be placed inside a quoted HTML attribute value.
        """
        if isinstance(s, Node):
            s = s.textContent
        s = s.encode("utf8")
        if escape:
            s = cgi.escape(s, quote)
            if quote:
                # cgi.escape(quote=True) only takes care of double quotes.
                s = s.replace("'", "&#39;")
        buffer_index = 5 if context["is_in_footnotes"] else 3
        bookcontent[-1][buffer_index].write(s)

    def write_raw(s):
        """Append ``s`` without any HTML escaping."""
        write(s, escape=False)

    def write_attr(s):
        """Append ``s`` escaped for use inside a quoted attribute value."""
        write(s, quote=True)

    def present(counters):
        """Return the counter values that are set (i.e. not None)."""
        return [c for c in counters if c is not None]

    class Renderer:
        # Nodes whose children are rendered but which emit nothing themselves.
        pass_through = ("#document", "document", "appendix", "bgroup",
                        "titlepage")

        # Nodes that are dropped together with their children.
        skip = ("documentclass", "usepackage", "setdefaultlanguage",
                "restylefloat", "floatstyle", "makeindex", "newcommand",
                "addcontentsline", "printindex", "Index", "midrule", "newif",
                "newenvironment", "minipage", "vfill", "vspace",
                "plastexfalse")

        # specify either a tag name as a string (e.g. "p")
        # or a tuple of HTML to wrap around the content (e.g. ("<p>", "</p>")).
        wrap = {
            "slash": ("\n<br/>", ""),
            "newpage": ("\n<hr/>", ""),
            "clearpage": ("\n<hr/>", ""),
            "emph": "i",
            "it": "i",
            "underline": "u",
            "textbf": "b",
            "bf": "b",
            "bfseries": "b",
            "tt": "tt",
            "bigskip": ("\n\n<p> </p>", ""),
            "_": ("_", ""),
            "$": ("$", ""),
            "%": ("%", ""),
            "&": ("&amp;", ""),  # written raw, so it must be an entity
            "#": ("#", ""),
            "active::~": ("&nbsp;", ""),  # the LaTeX tie is non-breaking
            " ": (" ", ""),
            "quotation": ("\n<blockquote>", "</blockquote>"),
            "center": ("\n<center>", "</center>"),
            "centering": ("\n<center>", "</center>"),
            "hspace": ("<span> </span>", ""),
            "enumerate": "ol",
            "itemize": "ul",
            "item": "li",
            "verbatim": "pre",
            "tabular": "table",
            "ArrayRow": "tr",
            "ArrayCell": "td",
            "hline": ("", ""),  # use in tables is weird
            "math": "i",
            "active::_": "sub",
            "ldots": (" . . . ", ""),
            "textasciitilde": ("~", ""),
            "rule": ("<hr/>", ""),
            "-": ("", ""),  # discretionary hyphen
            "copyright": ("&copy;", ""),  # entity keeps the output ASCII
            "cleardoublepage": ("<hr/>", ""),
        }

        counter_order = ("chapter", "section", "subsection", "subsubsection")

        def __init__(self):
            self.metadata = {}
            self.counters = {}
            self.indent = True
            self.labels = {}
            self.cur_figure = None
            self.hold_par = False
            self.has_par_content = False

        def next_counter(self, counter):
            """Increment ``counter`` and return its new value."""
            # when we go to a new chapter, clear the section counter, etc.
            if counter in self.counter_order:
                level = self.counter_order.index(counter)
                for deeper in self.counter_order[level + 1:]:
                    self.counters.pop(deeper, None)
            self.counters[counter] = self.counters.get(counter, 0) + 1
            return self.counters[counter]

        def title(self, node):
            # convert from DOM.Text to unicode
            self.metadata[node.nodeName] = node.textContent + ""

        def author(self, node):
            # convert from DOM.Text to unicode
            self.metadata[node.nodeName] = node.textContent + ""

        def maketitle(self, node):
            write_raw("<h1>")
            write(self.metadata.get("title", ""))
            write_raw("</h1>\n")
            write_raw("<h2>")
            write(self.metadata.get("author", ""))
            write_raw("</h2>\n")

        def heading_start(self, node, elemname):
            """Open a new page (or extra TOC entry) and write the heading."""
            is_numbered = (node.attributes.get("*modifier*", "") != "*")
            if is_numbered:
                section_number = str(self.next_counter(node.nodeName)) + ". "

            # A missing title must not crash: fall back to an empty string
            # instead of calling .textContent on a plain str.
            title = node.attributes.get("title") or ""
            # "+ ''" converts from DOM.Text to unicode
            title_text = getattr(title, "textContent", title) + ""
            counter_values = [self.counters.get(x, None)
                              for x in self.counter_order]

            if self.has_par_content or not condense_simple_sections:
                bookcontent.append((
                    node.nodeName,
                    counter_values,
                    title_text,
                    StringIO(),
                    [],
                    StringIO(),  # footnotes
                ))
                self.has_par_content = False
            else:
                # store an extraneous TOC entry within this page
                bookcontent[-1][4].append((
                    node.nodeName,
                    counter_values,
                    title_text,
                ))

            write_raw("<%s>" % elemname)
            if is_numbered:
                write(section_number)
            write(title_text)
            write_raw("</%s>\n" % elemname)

        def chapter_start(self, node):
            self.heading_start(node, "h1")

        def section_start(self, node):
            self.heading_start(node, "h2")

        def subsection_start(self, node):
            self.heading_start(node, "h3")

        def subsubsection_start(self, node):
            self.heading_start(node, "h4")

        def tableofcontents(self, node):
            write_raw(toc_placeholder)

        def index(self, node):
            pass

        def par_start(self, node):
            if self.hold_par:
                return
            if node.textContent.strip() == "":
                return
            self.has_par_content = True
            write_raw("\n<p class='%s'>"
                      % ("indent" if self.indent else "noindent"))
            self.indent = True

        def par_end(self, node):
            if self.hold_par:
                return
            if node.textContent.strip() == "":
                return
            write_raw("</p>")

        def noindent(self, node):
            # not working, seems to occur *after* the par node
            self.indent = False

        # For the size/variant switches below we don't know if we are
        # wrapping block level or inline content, so a <span> is used.

        def small_start(self, node):
            write_raw("<span style='font-size: 85%'>")

        def small_end(self, node):
            write_raw("</span>")

        def large_start(self, node):
            write_raw("<span style='font-size: 115%'>")

        def large_end(self, node):
            write_raw("</span>")

        def Large_start(self, node):
            write_raw("<span style='font-size: 125%'>")

        def Large_end(self, node):
            write_raw("</span>")

        def huge_start(self, node):
            write_raw("<span style='font-size: 150%'>")

        def huge_end(self, node):
            write_raw("</span>")

        def textsc_start(self, node):
            write_raw("<span style='font-variant:small-caps;'>")

        def textsc_end(self, node):
            write_raw("</span>")

        def footnotesize_start(self, node):
            write_raw("<span style='font-size: 80%'>")

        def footnotesize_end(self, node):
            write_raw("</span>")

        def url(self, node):
            if "url" not in node.attributes:
                raise Exception("\\url without url attribute: "
                                + node.toXML())
            write_raw("<a href=\"")
            write_attr(node.attributes["url"])
            write_raw("\" target=\"_blank\">")
            write(node.attributes["url"])
            write_raw("</a>")

        def href(self, node):
            write_raw("<a href=\"")
            write_attr(node.attributes["url"])
            write_raw("\" target=\"_blank\">")
            write(node.attributes["self"]
                  if node.attributes["self"] else "???")
            write_raw("</a>")

        def figure_start(self, node):
            self.cur_figure = self.next_counter("figure")
            write_raw("<div class='figure'>")

        def figure_end(self, node):
            self.cur_figure = None
            write_raw("</div>")

        def caption_start(self, node):
            if self.hold_par:
                raise Exception("Nested captions.")
            # Suppress the <p> of paragraphs inside the caption.
            self.hold_par = True
            write_raw("<p class='caption'>")
            write("Figure " + str(self.counters.get("figure", "?")) + ". ")

        def caption_end(self, node):
            self.hold_par = False
            write_raw("</p>")

        def graphic(self, node):
            # use \newcommand{\includegraphics}[2][]{\graphic #2}
            # "+ ''" converts from DOM.Text to unicode
            fn = node.nextSibling.textContent + ""
            fn = fn.replace(".pdf", "").replace(".png", "")
            write_raw("<div class='img_container'><img width='100%' src='")
            write_attr(make_url_to_figure(fn))
            write_raw("'/></div>")

            # The next sibling has a text node with the image filename. I'm
            # not sure where it is coming from. Clear it out. Removing the
            # node somehow causes a parent <p> to not be closed.
            return "IGNORE_NEXT_SIBLING"

        def footnote_start(self, node):
            c = str(self.next_counter("footnote"))
            if footnotes_inline:
                write_raw("<span class='footnote_marker' title='")
                # Footnote text routinely contains apostrophes, which would
                # otherwise terminate the single-quoted attribute.
                write_attr(node.textContent)
                write_raw("'>[" + c + "]</span>")
                write_raw("<span id='footnote_" + c
                          + "' class='footnote_entry' style='display: none'>"
                          + c + ". ")
            else:
                fnid = str(context["fnid"])
                # The marker goes into the page content ...
                write_raw("<a name='fn_" + fnid
                          + "_anchor'></a><sup style='font-size: 75%'>"
                          + "<a href='#fn_" + fnid + "_note'>" + c
                          + "</a></sup>")
                # ... and everything up to footnote_end into the footnotes.
                context["is_in_footnotes"] = True
                write_raw("<p style='font-size: 90%'><a name='fn_" + fnid
                          + "_note'></a><a href='#fn_" + fnid + "_anchor'>"
                          + c + "</a>. ")
                context["fnid"] += 1

        def footnote_end(self, node):
            if footnotes_inline:
                write_raw("</span>")
            else:
                write_raw("</p>")
                context["is_in_footnotes"] = False

        def label(self, node):
            # store a tuple to the index of the book segment we are in (for
            # generating links), the section number, and the figure number if
            # we're in a figure.
            # NOTE(review): a self-closing <span/> is only well-formed in
            # XHTML; left unchanged in case consumers depend on it.
            write_raw("<span class='label' id='label_")
            write_attr(node.attributes["label"])
            write_raw("'/>")
            section_number = ".".join(
                str(self.counters[x])
                for x in self.counter_order if x in self.counters)
            self.labels[node.attributes["label"]] = (
                len(bookcontent) - 1, section_number, self.cur_figure)

        def ref(self, node):
            # must match the regex at the end
            lab = node.attributes["label"]
            write_raw("<reference>")
            write_raw(lab)
            write_raw("</reference>")

    renderer = Renderer()

    def find_handler(name):
        """Return the renderer method called ``name``, or None.

        Only callables count: the renderer also has plain state attributes
        (e.g. ``indent``) that share names with LaTeX commands.
        """
        handler = getattr(renderer, name, None)
        return handler if callable(handler) else None

    def process_node(node):
        """Render ``node`` and its children; may return a command string."""
        if node.nodeType == Node.TEXT_NODE:
            write(node.textContent)

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.ELEMENT_NODE):
            nodeName = node.nodeName.replace("\\", "slash")
            start_handler = find_handler(nodeName + "_start")
            single_handler = find_handler(nodeName)

            if nodeName in renderer.pass_through:
                pass
            elif nodeName in renderer.skip:
                return
            elif nodeName in renderer.wrap:
                w = renderer.wrap[nodeName]
                if isinstance(w, tuple):
                    write_raw(w[0])
                else:
                    write_raw("<" + w + ">")
            elif start_handler is not None:
                start_handler(node)
            elif single_handler is not None:
                # Single-shot handlers render the node completely.
                return single_handler(node)
            else:
                write_raw("<p>UNHANDLED NODE: ")
                write(node.nodeName + ": ")
                write(node.toXML())
                write_raw("</p>\n")

            cmd = None
            for child in node:
                if cmd == "IGNORE_NEXT_SIBLING":
                    cmd = None
                    continue
                cmd = process_node(child)

            if nodeName in renderer.wrap:
                w = renderer.wrap[nodeName]
                if isinstance(w, tuple):
                    write_raw(w[1])
                else:
                    write_raw("</" + w + ">")
            else:
                end_handler = find_handler(nodeName + "_end")
                if end_handler is not None:
                    return end_handler(node)

    bookcontent.append((None, ["prologue"], None, StringIO(), [], StringIO()))
    process_node(doc)

    content_map = {}  # index in bookcontent -> index in toc
    book_pages = {}   # page name -> index in toc
    toc = []
    for i, entry in enumerate(bookcontent):
        _, counters, title, content, extras, footnotes = entry
        if i == 0 and skip_prologue:
            continue  # skip prologue material
        if counters and counters[0] in embargo_chapters:
            continue  # block certain chapters

        numbers = present(counters)
        entrypagename = "-".join([str(e) for e in numbers])
        if title:
            # Only words starting with three letters go into the slug.
            words = [w for w in title.split(" ")
                     if re.match("[A-Za-z]{3}", w)]
            entrypagename += "/" + slugify(" ".join(words))

        book_pages[entrypagename] = len(toc)
        content_map[i] = len(toc)

        toc.append({
            "indent": len(numbers),
            "number": ".".join([str(e) for e in numbers]),
            "name": title,
            "href": make_url_to_page(entrypagename),
            "content": content.getvalue(),
            "footnotes": footnotes.getvalue(),
            "extraneous_entries": [{
                "indent": len(present(e2[1])),
                "number": ".".join([str(e) for e in present(e2[1])]),
                "name": e2[2],
            } for e2 in extras],
        })

    def fill_ref(match):
        """Replace a <reference> placeholder with the number it refers to."""
        if match.group(1) not in renderer.labels:
            return "[unknown reference]"
        section_index, section_number, figure_counter = \
            renderer.labels[match.group(1)]
        if figure_counter:
            text = str(figure_counter)
        else:
            text = section_number
        # Figures and references into omitted pages are not linked.
        if section_index not in content_map or figure_counter:
            return cgi.escape(text)
        return str("<a class=\"reference\" href=\""
                   + cgi.escape(toc[content_map[section_index]]["href"], True)
                   + "\">" + cgi.escape(text) + "</a>")

    for entry in toc:
        entry["content"] = re.sub("<reference>(.*?)</reference>", fill_ref,
                                  entry["content"])

    return {"pages": book_pages, "toc": toc}
def latex_to_html(texfilename,
                  embargo_chapters=(),
                  make_url_to_page=lambda x: x,
                  make_url_to_figure=lambda x: x,
                  skip_prologue=False,
                  footnotes_inline=False,
                  toc_placeholder="",
                  condense_simple_sections=False):
    """Convert a LaTeX book into a set of HTML pages and a table of contents.

    The file is parsed with ``TeX(file=texfilename).parse()`` and the document
    tree is walked recursively.  Content that precedes the first heading goes
    into a "prologue" page; each heading then starts a new page, except that
    with ``condense_simple_sections`` a heading is folded into the current
    page while that page has not yet received a non-empty paragraph.

    Args:
        texfilename: Path handed to the TeX parser.
        embargo_chapters: Collection of chapter numbers.  Pages whose first
            counter is in this collection are left out of the result.
        make_url_to_page: Callable mapping a page name to the ``href`` stored
            in the table of contents and used for cross references.
        make_url_to_figure: Callable mapping a figure file name (with
            ``.pdf``/``.png`` removed) to the ``src`` of the ``<img>`` tag.
        skip_prologue: If true, the prologue page is left out of the result.
        footnotes_inline: If true, footnote text is emitted in place inside a
            hidden ``<span>``; otherwise it is collected in the page's
            separate footnotes buffer and linked with anchors.
        toc_placeholder: Raw HTML written where ``\\tableofcontents`` occurs.
        condense_simple_sections: See above.

    Returns:
        A dict with two keys: ``"pages"`` maps each page name to its index in
        ``"toc"``; ``"toc"`` is a list of dicts with the keys ``indent``,
        ``number``, ``name``, ``href``, ``content``, ``footnotes`` and
        ``extraneous_entries``.
    """
    doc = TeX(file=texfilename).parse()
    doc.normalize()

    # One tuple per output page:
    #   (node name, counter values, title, content buffer,
    #    extraneous TOC entries, footnotes buffer)
    bookcontent = []
    context = {"is_in_footnotes": False, "fnid": 0}

    def write(s, escape=True, quote=False):
        """Append ``s`` to the current page's content or footnotes buffer.

        With ``quote`` set, both quote characters are escaped as well so the
        text can be placed inside a quoted HTML attribute value.
        """
        if isinstance(s, Node):
            s = s.textContent
        s = s.encode("utf8")
        if escape:
            s = cgi.escape(s, quote)
            if quote:
                # cgi.escape(quote=True) only takes care of double quotes.
                s = s.replace("'", "&#39;")
        buffer_index = 5 if context["is_in_footnotes"] else 3
        bookcontent[-1][buffer_index].write(s)

    def write_raw(s):
        """Append ``s`` without any HTML escaping."""
        write(s, escape=False)

    def write_attr(s):
        """Append ``s`` escaped for use inside a quoted attribute value."""
        write(s, quote=True)

    def present(counters):
        """Return the counter values that are set (i.e. not None)."""
        return [c for c in counters if c is not None]

    class Renderer:
        # Nodes whose children are rendered but which emit nothing themselves.
        pass_through = ("#document", "document", "appendix", "bgroup",
                        "titlepage")

        # Nodes that are dropped together with their children.
        skip = ("documentclass", "usepackage", "setdefaultlanguage",
                "restylefloat", "floatstyle", "makeindex", "newcommand",
                "addcontentsline", "printindex", "Index", "midrule", "newif",
                "newenvironment", "minipage", "vfill", "vspace",
                "plastexfalse")

        # specify either a tag name as a string (e.g. "p")
        # or a tuple of HTML to wrap around the content (e.g. ("<p>", "</p>")).
        wrap = {
            "slash": ("\n<br/>", ""),
            "newpage": ("\n<hr/>", ""),
            "clearpage": ("\n<hr/>", ""),
            "emph": "i",
            "it": "i",
            "underline": "u",
            "textbf": "b",
            "bf": "b",
            "bfseries": "b",
            "tt": "tt",
            "bigskip": ("\n\n<p> </p>", ""),
            "_": ("_", ""),
            "$": ("$", ""),
            "%": ("%", ""),
            "&": ("&amp;", ""),  # written raw, so it must be an entity
            "#": ("#", ""),
            "active::~": ("&nbsp;", ""),  # the LaTeX tie is non-breaking
            " ": (" ", ""),
            "quotation": ("\n<blockquote>", "</blockquote>"),
            "center": ("\n<center>", "</center>"),
            "centering": ("\n<center>", "</center>"),
            "hspace": ("<span> </span>", ""),
            "enumerate": "ol",
            "itemize": "ul",
            "item": "li",
            "verbatim": "pre",
            "tabular": "table",
            "ArrayRow": "tr",
            "ArrayCell": "td",
            "hline": ("", ""),  # use in tables is weird
            "math": "i",
            "active::_": "sub",
            "ldots": (" . . . ", ""),
            "textasciitilde": ("~", ""),
            "rule": ("<hr/>", ""),
            "-": ("", ""),  # discretionary hyphen
            "copyright": ("&copy;", ""),  # entity keeps the output ASCII
            "cleardoublepage": ("<hr/>", ""),
        }

        counter_order = ("chapter", "section", "subsection", "subsubsection")

        def __init__(self):
            self.metadata = {}
            self.counters = {}
            self.indent = True
            self.labels = {}
            self.cur_figure = None
            self.hold_par = False
            self.has_par_content = False

        def next_counter(self, counter):
            """Increment ``counter`` and return its new value."""
            # when we go to a new chapter, clear the section counter, etc.
            if counter in self.counter_order:
                level = self.counter_order.index(counter)
                for deeper in self.counter_order[level + 1:]:
                    self.counters.pop(deeper, None)
            self.counters[counter] = self.counters.get(counter, 0) + 1
            return self.counters[counter]

        def title(self, node):
            # convert from DOM.Text to unicode
            self.metadata[node.nodeName] = node.textContent + ""

        def author(self, node):
            # convert from DOM.Text to unicode
            self.metadata[node.nodeName] = node.textContent + ""

        def maketitle(self, node):
            write_raw("<h1>")
            write(self.metadata.get("title", ""))
            write_raw("</h1>\n")
            write_raw("<h2>")
            write(self.metadata.get("author", ""))
            write_raw("</h2>\n")

        def heading_start(self, node, elemname):
            """Open a new page (or extra TOC entry) and write the heading."""
            is_numbered = (node.attributes.get("*modifier*", "") != "*")
            if is_numbered:
                section_number = str(self.next_counter(node.nodeName)) + ". "

            # A missing title must not crash: fall back to an empty string
            # instead of calling .textContent on a plain str.
            title = node.attributes.get("title") or ""
            # "+ ''" converts from DOM.Text to unicode
            title_text = getattr(title, "textContent", title) + ""
            counter_values = [self.counters.get(x, None)
                              for x in self.counter_order]

            if self.has_par_content or not condense_simple_sections:
                bookcontent.append((
                    node.nodeName,
                    counter_values,
                    title_text,
                    StringIO(),
                    [],
                    StringIO(),  # footnotes
                ))
                self.has_par_content = False
            else:
                # store an extraneous TOC entry within this page
                bookcontent[-1][4].append((
                    node.nodeName,
                    counter_values,
                    title_text,
                ))

            write_raw("<%s>" % elemname)
            if is_numbered:
                write(section_number)
            write(title_text)
            write_raw("</%s>\n" % elemname)

        def chapter_start(self, node):
            self.heading_start(node, "h1")

        def section_start(self, node):
            self.heading_start(node, "h2")

        def subsection_start(self, node):
            self.heading_start(node, "h3")

        def subsubsection_start(self, node):
            self.heading_start(node, "h4")

        def tableofcontents(self, node):
            write_raw(toc_placeholder)

        def index(self, node):
            pass

        def par_start(self, node):
            if self.hold_par:
                return
            if node.textContent.strip() == "":
                return
            self.has_par_content = True
            write_raw("\n<p class='%s'>"
                      % ("indent" if self.indent else "noindent"))
            self.indent = True

        def par_end(self, node):
            if self.hold_par:
                return
            if node.textContent.strip() == "":
                return
            write_raw("</p>")

        def noindent(self, node):
            # not working, seems to occur *after* the par node
            self.indent = False

        # For the size/variant switches below we don't know if we are
        # wrapping block level or inline content, so a <span> is used.

        def small_start(self, node):
            write_raw("<span style='font-size: 85%'>")

        def small_end(self, node):
            write_raw("</span>")

        def large_start(self, node):
            write_raw("<span style='font-size: 115%'>")

        def large_end(self, node):
            write_raw("</span>")

        def Large_start(self, node):
            write_raw("<span style='font-size: 125%'>")

        def Large_end(self, node):
            write_raw("</span>")

        def huge_start(self, node):
            write_raw("<span style='font-size: 150%'>")

        def huge_end(self, node):
            write_raw("</span>")

        def textsc_start(self, node):
            write_raw("<span style='font-variant:small-caps;'>")

        def textsc_end(self, node):
            write_raw("</span>")

        def footnotesize_start(self, node):
            write_raw("<span style='font-size: 80%'>")

        def footnotesize_end(self, node):
            write_raw("</span>")

        def url(self, node):
            if "url" not in node.attributes:
                raise Exception("\\url without url attribute: "
                                + node.toXML())
            write_raw("<a href=\"")
            write_attr(node.attributes["url"])
            write_raw("\" target=\"_blank\">")
            write(node.attributes["url"])
            write_raw("</a>")

        def href(self, node):
            write_raw("<a href=\"")
            write_attr(node.attributes["url"])
            write_raw("\" target=\"_blank\">")
            write(node.attributes["self"]
                  if node.attributes["self"] else "???")
            write_raw("</a>")

        def figure_start(self, node):
            self.cur_figure = self.next_counter("figure")
            write_raw("<div class='figure'>")

        def figure_end(self, node):
            self.cur_figure = None
            write_raw("</div>")

        def caption_start(self, node):
            if self.hold_par:
                raise Exception("Nested captions.")
            # Suppress the <p> of paragraphs inside the caption.
            self.hold_par = True
            write_raw("<p class='caption'>")
            write("Figure " + str(self.counters.get("figure", "?")) + ". ")

        def caption_end(self, node):
            self.hold_par = False
            write_raw("</p>")

        def graphic(self, node):
            # use \newcommand{\includegraphics}[2][]{\graphic #2}
            # "+ ''" converts from DOM.Text to unicode
            fn = node.nextSibling.textContent + ""
            fn = fn.replace(".pdf", "").replace(".png", "")
            write_raw("<div class='img_container'><img width='100%' src='")
            write_attr(make_url_to_figure(fn))
            write_raw("'/></div>")

            # The next sibling has a text node with the image filename. I'm
            # not sure where it is coming from. Clear it out. Removing the
            # node somehow causes a parent <p> to not be closed.
            return "IGNORE_NEXT_SIBLING"

        def footnote_start(self, node):
            c = str(self.next_counter("footnote"))
            if footnotes_inline:
                write_raw("<span class='footnote_marker' title='")
                # Footnote text routinely contains apostrophes, which would
                # otherwise terminate the single-quoted attribute.
                write_attr(node.textContent)
                write_raw("'>[" + c + "]</span>")
                write_raw("<span id='footnote_" + c
                          + "' class='footnote_entry' style='display: none'>"
                          + c + ". ")
            else:
                fnid = str(context["fnid"])
                # The marker goes into the page content ...
                write_raw("<a name='fn_" + fnid
                          + "_anchor'></a><sup style='font-size: 75%'>"
                          + "<a href='#fn_" + fnid + "_note'>" + c
                          + "</a></sup>")
                # ... and everything up to footnote_end into the footnotes.
                context["is_in_footnotes"] = True
                write_raw("<p style='font-size: 90%'><a name='fn_" + fnid
                          + "_note'></a><a href='#fn_" + fnid + "_anchor'>"
                          + c + "</a>. ")
                context["fnid"] += 1

        def footnote_end(self, node):
            if footnotes_inline:
                write_raw("</span>")
            else:
                write_raw("</p>")
                context["is_in_footnotes"] = False

        def label(self, node):
            # store a tuple to the index of the book segment we are in (for
            # generating links), the section number, and the figure number if
            # we're in a figure.
            # NOTE(review): a self-closing <span/> is only well-formed in
            # XHTML; left unchanged in case consumers depend on it.
            write_raw("<span class='label' id='label_")
            write_attr(node.attributes["label"])
            write_raw("'/>")
            section_number = ".".join(
                str(self.counters[x])
                for x in self.counter_order if x in self.counters)
            self.labels[node.attributes["label"]] = (
                len(bookcontent) - 1, section_number, self.cur_figure)

        def ref(self, node):
            # must match the regex at the end
            lab = node.attributes["label"]
            write_raw("<reference>")
            write_raw(lab)
            write_raw("</reference>")

    renderer = Renderer()

    def find_handler(name):
        """Return the renderer method called ``name``, or None.

        Only callables count: the renderer also has plain state attributes
        (e.g. ``indent``) that share names with LaTeX commands.
        """
        handler = getattr(renderer, name, None)
        return handler if callable(handler) else None

    def process_node(node):
        """Render ``node`` and its children; may return a command string."""
        if node.nodeType == Node.TEXT_NODE:
            write(node.textContent)

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.ELEMENT_NODE):
            nodeName = node.nodeName.replace("\\", "slash")
            start_handler = find_handler(nodeName + "_start")
            single_handler = find_handler(nodeName)

            if nodeName in renderer.pass_through:
                pass
            elif nodeName in renderer.skip:
                return
            elif nodeName in renderer.wrap:
                w = renderer.wrap[nodeName]
                if isinstance(w, tuple):
                    write_raw(w[0])
                else:
                    write_raw("<" + w + ">")
            elif start_handler is not None:
                start_handler(node)
            elif single_handler is not None:
                # Single-shot handlers render the node completely.
                return single_handler(node)
            else:
                write_raw("<p>UNHANDLED NODE: ")
                write(node.nodeName + ": ")
                write(node.toXML())
                write_raw("</p>\n")

            cmd = None
            for child in node:
                if cmd == "IGNORE_NEXT_SIBLING":
                    cmd = None
                    continue
                cmd = process_node(child)

            if nodeName in renderer.wrap:
                w = renderer.wrap[nodeName]
                if isinstance(w, tuple):
                    write_raw(w[1])
                else:
                    write_raw("</" + w + ">")
            else:
                end_handler = find_handler(nodeName + "_end")
                if end_handler is not None:
                    return end_handler(node)

    bookcontent.append((None, ["prologue"], None, StringIO(), [], StringIO()))
    process_node(doc)

    content_map = {}  # index in bookcontent -> index in toc
    book_pages = {}   # page name -> index in toc
    toc = []
    for i, entry in enumerate(bookcontent):
        _, counters, title, content, extras, footnotes = entry
        if i == 0 and skip_prologue:
            continue  # skip prologue material
        if counters and counters[0] in embargo_chapters:
            continue  # block certain chapters

        numbers = present(counters)
        entrypagename = "-".join([str(e) for e in numbers])
        if title:
            # Only words starting with three letters go into the slug.
            words = [w for w in title.split(" ")
                     if re.match("[A-Za-z]{3}", w)]
            entrypagename += "/" + slugify(" ".join(words))

        book_pages[entrypagename] = len(toc)
        content_map[i] = len(toc)

        toc.append({
            "indent": len(numbers),
            "number": ".".join([str(e) for e in numbers]),
            "name": title,
            "href": make_url_to_page(entrypagename),
            "content": content.getvalue(),
            "footnotes": footnotes.getvalue(),
            "extraneous_entries": [{
                "indent": len(present(e2[1])),
                "number": ".".join([str(e) for e in present(e2[1])]),
                "name": e2[2],
            } for e2 in extras],
        })

    def fill_ref(match):
        """Replace a <reference> placeholder with the number it refers to."""
        if match.group(1) not in renderer.labels:
            return "[unknown reference]"
        section_index, section_number, figure_counter = \
            renderer.labels[match.group(1)]
        if figure_counter:
            text = str(figure_counter)
        else:
            text = section_number
        # Figures and references into omitted pages are not linked.
        if section_index not in content_map or figure_counter:
            return cgi.escape(text)
        return str("<a class=\"reference\" href=\""
                   + cgi.escape(toc[content_map[section_index]]["href"], True)
                   + "\">" + cgi.escape(text) + "</a>")

    for entry in toc:
        entry["content"] = re.sub("<reference>(.*?)</reference>", fill_ref,
                                  entry["content"])

    return {"pages": book_pages, "toc": toc}