def testFunc(self, file_name=file_name):
    assert file_name.endswith(".src.html")
    base_path = file_name[:-len(".src.html")]
    kwargs = {}
    try:
        options_file_name = base_path + ".options"
        with open(options_file_name, "r") as options_file:
            kwargs = json.load(options_file)
    except IOError:
        pass
    default_processes = ["filter", "sub", "toc", "xref", "annotate"]
    new_processes = kwargs.get("processes", [])
    assert not set(default_processes) & set(new_processes)
    kwargs["processes"] = default_processes + new_processes
    # Sort attributes alphabetically by default.
    kwargs["alphabetical_attributes"] = True
    try:
        output = StringIO.StringIO()
        # Get the input
        input = open(file_name, "rb")
        tree = generator.fromFile(input, **kwargs)
        input.close()
        # Get the output
        generator.toFile(tree, output, **kwargs)
        # Get the expected result
        expectedfp = open(base_path + ".html", "rb")
        expected = expectedfp.read()
        expectedfp.close()
        # Run the test
        self.assertEquals(output.getvalue(), expected)
    except IOError as err:
        self.fail(err)
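# For reference, the optional base_path + ".options" file read above is decoded
# with json.load into a plain dict of extra keyword arguments for
# generator.fromFile/toFile. A hypothetical example (a sketch, not a file from
# the test suite; the keys are borrowed from the opts dict used in the build
# scripts further below) could decode to something like this. Note that any
# "processes" entry must not repeat the defaults, or the assert above fails.
example_options = {
    "max_depth": 4,
    "w3c_compat": True,
}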
def testFunc(self, file_name=file_name):
    try:
        # Get the input
        input = open(file_name, "rb")
        tree = generator.fromFile(input)
        input.close()
        # Get the output
        output = StringIO.StringIO()
        generator.toFile(tree, output)
        # Get the expected result
        expected = open(file_name[:-9] + ".html", "rb")
        # Run the test
        self.assertEquals(output.getvalue(), expected.read())
        # Close the files
        output.close()
        expected.close()
    except IOError as err:
        self.fail(err)
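# The file_name=file_name default argument is the usual trick for freezing the
# loop variable when test methods are generated dynamically, one per
# *.src.html input. The harness that does this is not shown here, so the class
# name, glob pattern, and method naming below are assumptions; this is only a
# sketch of the pattern the signature implies.
import glob
import os
import unittest

class GeneratorTestCase(unittest.TestCase):
    pass

for file_name in glob.glob(os.path.join("tests", "*.src.html")):
    def testFunc(self, file_name=file_name):
        pass  # body as in either version above
    test_name = "test_" + os.path.basename(file_name)[:-len(".src.html")]
    setattr(GeneratorTestCase, test_name, testFunc)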
    'w3c_compat_xref_normalization': False,
}

print 'indexing'
filtered.seek(0)
tree = generator.fromFile(filtered, **opts)
filtered.close()

try:
    os.makedirs('output/%s' % spec)
except:
    pass

if spec == 'html':
    from glob import glob
    for name in glob('output/html/*.html'):
        os.remove(name)
    output = open('output/html/single-page.html', 'wb')
else:
    output = open('output/%s/Overview.html' % spec, 'wb')

generator.toFile(tree, output, **opts)
output.close()

if spec == 'html':
    print 'splitting'
    import spec_splitter
    spec_splitter.w3c = True
    spec_splitter.main('output/html/single-page.html', 'output/html')
def main(spec, spec_dir, branch="master"):
    conf = None
    try:
        conf = config.load_config()[spec]
    except KeyError:
        invoked_incorrectly()
    if 'select' in conf:
        select = conf['select']
    else:
        select = spec
    try:
        if not spec_dir:
            spec_dir = os.path.join(conf["output"], spec)
    except KeyError:
        sys.stderr.write("error: Must specify output directory for %s! \
Check default-config.json.\n" % spec)
        exit()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    os.chdir(conf["path"])

    print "parsing"
    source = open('source')
    after_microsyntax = StringIO()
    parser_microsyntax.main(source, after_microsyntax)
    after_microsyntax.seek(0)
    succint = StringIO()
    bs.main(after_microsyntax, succint)
    succint.seek(0)
    filtered = StringIO()
    try:
        boilerplate.main(succint, filtered, select, branch)
    except IOError:
        sys.stderr.write("error: Problem loading boilerplate for %s. \
Are you on the correct branch?\n" % spec)
        exit()
    succint.close()

    # See http://hg.gsnedders.com/anolis/file/tip/anolis
    opts = {
        'allow_duplicate_dfns': True,
        'disable': None,
        'escape_lt_in_attrs': False,
        'escape_rcdata': False,
        'force_html4_id': False,
        'indent_char': u' ',
        'inject_meta_charset': False,
        'max_depth': 6,
        'min_depth': 2,
        'minimize_boolean_attributes': False,
        'newline_char': u'\n',
        'omit_optional_tags': False,
        'output_encoding': 'utf-8',
        'parser': 'html5lib',
        'processes': set(['toc', 'xref', 'sub']),
        'profile': False,
        'quote_attr_values': True,
        'serializer': 'html5lib',
        'space_before_trailing_solidus': False,
        'strip_whitespace': None,
        'use_best_quote_char': False,
        'use_trailing_solidus': False,
        'w3c_compat_class_toc': False,
        'w3c_compat_crazy_substitutions': False,
        'w3c_compat_substitutions': False,
        'w3c_compat': True,
        'w3c_compat_xref_a_placement': False,
        'w3c_compat_xref_elements': False,
        'w3c_compat_xref_normalization': False,
    }
    if "anolis" in conf:
        opts.update(conf["anolis"])

    if spec == "srcset":
        import html5lib
        print 'munging (before anolis)'
        filtered.seek(0)
        pre_anolis_buffer = StringIO()

        # Parse
        parser = html5lib.html5parser.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder('lxml'))
        tree = parser.parse(filtered, encoding='utf-8')

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='utf-8'):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    print 'indexing'
    filtered.seek(0)
    tree = generator.fromFile(filtered, **opts)
    filtered.close()

    # fixup nested dd's and dt's produced by lxml
    for dd in tree.findall('//dd/dd'):
        if list(dd) or dd.text.strip():
            dd.getparent().addnext(dd)
        else:
            dd.getparent().remove(dd)
    for dt in tree.findall('//dt/dt'):
        if list(dt) or dt.text.strip():
            dt.getparent().addnext(dt)
        else:
            dt.getparent().remove(dt)

    if spec == "microdata":
        print 'munging'
        import lxml
        # get the h3 for the misplaced section (it has no container)
        section = tree.xpath("//h3[@id = 'htmlpropertiescollection']")[0]
        # then get all of its following siblings that have the h2 for the next section as
        # a following sibling themselves. Yeah, XPath doesn't suck.
        section_content = section.xpath("following-sibling::*[following-sibling::h2[@id='introduction']]")
        target = tree.xpath("//h2[@id = 'converting-html-to-other-formats']")[0].getparent()
        target.addprevious(section)
        for el in section_content:
            target.addprevious(el)
        section.xpath("span")[0].text = "6.1 "
        # move the toc as well
        link = tree.xpath("//ol[@class='toc']//a[@href='#htmlpropertiescollection']")[0]
        link.xpath("span")[0].text = "6.1 "
        tree.xpath("//ol[@class='toc']/li[a[@href='#microdata-dom-api']]")[0].append(link.getparent().getparent())

    if spec == "srcset":
        print 'munging (after anolis)'
        # In the WHATWG spec, srcset="" is simply an aspect of
        # HTMLImageElement and not a separate feature. In order to keep
        # the HTML WG's srcset="" spec organized, we have to move some
        # things around in the final document.

        # Move "The srcset IDL attribute must reflect..."
        reflect_the_content_attribute = tree.findall("//div[@class='impl']")[0]
        target = tree.find("//div[@class='note']")
        target.addprevious(reflect_the_content_attribute)

        # Move "The IDL attribute complete must return true..."
        note_about_complete = tree.findall("//p[@class='note']")[5]
        p_otherwise = note_about_complete.xpath("preceding-sibling::p[position()=1]")[0]
        ul_conditions = p_otherwise.xpath("preceding-sibling::ul[position()=1]")[0]
        p_start = ul_conditions.xpath("preceding-sibling::p[position()=1]")[0]
        target.addnext(note_about_complete)
        target.addnext(p_otherwise)
        target.addnext(ul_conditions)
        target.addnext(p_start)

    try:
        os.makedirs(spec_dir)
    except:
        pass

    if spec == 'html':
        print 'cleaning'
        from glob import glob
        for name in glob("%s/*.html" % spec_dir):
            os.remove(name)
        output = StringIO()
    else:
        output = open("%s/Overview.html" % spec_dir, 'wb')

    generator.toFile(tree, output, **opts)

    if spec != 'html':
        output.close()
    else:
        value = output.getvalue()
        if "<!--INTERFACES-->\n" in value:
            print 'interfaces'
            from interface_index import interface_index
            output.seek(0)
            index = StringIO()
            interface_index(output, index)
            value = value.replace("<!--INTERFACES-->\n", index.getvalue(), 1)
            index.close()
        output = open("%s/single-page.html" % spec_dir, 'wb')
        output.write(value)
        output.close()
        value = ''

        print 'splitting'
        import spec_splitter
        spec_splitter.w3c = True
        spec_splitter.no_split_exceptions = conf.get("no_split_exceptions", False)
        spec_splitter.minimal_split_exceptions = conf.get("minimal_split_exceptions", False)
        spec_splitter.main("%s/single-page.html" % spec_dir, spec_dir)

        print 'entities'
        entities = open(os.path.join(cur_dir, "boilerplate/entities.inc"))
        json = open("%s/entities.json" % spec_dir, 'w')
        from entity_processor_json import entity_processor_json
        entity_processor_json(entities, json)
        entities.close()
        json.close()

    # copying dependencies
    def copy_dependencies(targets):
        import types
        if not isinstance(targets, types.ListType):
            targets = [targets]
        for target in targets:
            os.system("/bin/csh -i -c '/bin/cp -R %s %s'" % (os.path.join(conf["path"], target), spec_dir))

    print "copying"
    if spec == "html":
        copy_dependencies(["images", "fonts", "404/*", "switcher", "js"])
    elif spec == "2dcontext":
        copy_dependencies(["images", "fonts"])
    else:
        copy_dependencies("fonts")

    # fix the styling of the 404
    if spec == "html":
        link = tree.xpath("//link[starts-with(@href, 'http://www.w3.org/StyleSheets/TR/')]")[0].get("href")
        path = os.path.join(spec_dir, "404.html")
        with open(path) as data:
            html404 = data.read()
        html404 = html404.replace("http://www.w3.org/StyleSheets/TR/W3C-ED", link)
        with open(path, "w") as data:
            data.write(html404)
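# The conf dict used in main() comes from config.load_config()[spec], and the
# error message points at default-config.json. A hypothetical entry covering
# the keys this function actually reads might be shaped like the sketch below;
# the values are illustrative only, not taken from the real configuration.
conf_example = {
    "html": {
        "path": "/path/to/html/checkout",   # working copy containing the `source` file
        "output": "/path/to/output",        # spec_dir defaults to <output>/<spec>
        "select": "w3c-html",               # optional; falls back to the spec name
        "anolis": {"max_depth": 4},         # optional overrides merged into opts
        "no_split_exceptions": False,
        "minimal_split_exceptions": False,
    }
}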
def main(spec, spec_dir, branch="master"):
    conf = None
    try:
        conf = config.load_config()[spec]
    except KeyError:
        invoked_incorrectly()
    if "select" in conf:
        select = conf["select"]
    else:
        select = spec
    try:
        if not spec_dir:
            if conf.get("bareOutput", False):
                spec_dir = conf["output"]
            else:
                spec_dir = os.path.join(conf["output"], spec)
    except KeyError:
        sys.stderr.write(
            "error: Must specify output directory for %s! \
Check default-config.json.\n" % spec
        )
        exit()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    os.chdir(conf["path"])

    print "parsing"
    source = open("source")
    after_microsyntax = StringIO()
    parser_microsyntax.main(source, after_microsyntax)
    after_microsyntax.seek(0)
    succint = StringIO()
    bs.main(after_microsyntax, succint)
    succint.seek(0)
    filtered = StringIO()

    if spec == "microdata":
        md_content = succint.read()
        md_content = re.sub(
            '<h2 id="iana">IANA considerations</h2>',
            '<!--BOILERPLATE microdata-extra-section--><h2 id="iana">IANA considerations</h2>',
            md_content,
        )
        succint = StringIO()
        succint.write(md_content)
        succint.seek(0)

    try:
        boilerplate.main(succint, filtered, select, branch)
    except IOError:
        sys.stderr.write(
            "error: Problem loading boilerplate for %s. \
Are you on the correct branch?\n" % spec
        )
        exit()
    succint.close()

    # See http://hg.gsnedders.com/anolis/file/tip/anolis
    opts = {
        "allow_duplicate_dfns": True,
        "disable": None,
        "escape_lt_in_attrs": False,
        "escape_rcdata": False,
        "force_html4_id": False,
        "indent_char": u" ",
        "inject_meta_charset": False,
        "max_depth": 6,
        "min_depth": 2,
        "minimize_boolean_attributes": False,
        "newline_char": u"\n",
        "omit_optional_tags": False,
        "output_encoding": "utf-8",
        "parser": "html5lib",
        "processes": set(["toc", "xref", "sub"]),
        "profile": False,
        "quote_attr_values": True,
        "serializer": "html5lib",
        "space_before_trailing_solidus": False,
        "strip_whitespace": None,
        "use_best_quote_char": False,
        "use_trailing_solidus": False,
        "w3c_compat_class_toc": False,
        "w3c_compat_crazy_substitutions": False,
        "w3c_compat_substitutions": False,
        "w3c_compat": True,
        "w3c_compat_xref_a_placement": False,
        "w3c_compat_xref_elements": False,
        "w3c_compat_xref_normalization": False,
    }
    if "anolis" in conf:
        opts.update(conf["anolis"])

    if spec == "srcset":
        print "munging (before anolis)"
        filtered.seek(0)
        pre_anolis_buffer = StringIO()

        # Parse
        parser = html5lib.html5parser.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"))
        tree = parser.parse(filtered, encoding="utf-8")

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker("lxml")(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding="utf-8"):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    # replace data-x with data-anolis-xref
    print "fixing xrefs"
    filtered.seek(0)

    # Parse
    builder = treebuilders.getTreeBuilder("lxml", etree)
    try:
        parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
    except TypeError:
        parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parse(filtered, encoding="utf-8")

    # Collect every element carrying a data-x attribute
    data_x = tree.findall("//*[@data-x]")
    non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-\_\/\|]+")
re.compile(r"[^a-zA-Z0-9 \-\_\/\|]+") for refel in data_x: refel.attrib["data-anolis-xref"] = refel.get("data-x") if refel.tag == "dfn" and not refel.get("id", False) and refel.attrib["data-anolis-xref"]: refel.attrib["id"] = generateID(refel.attrib["data-anolis-xref"], refel) del refel.attrib["data-x"] # utils.ids = {} print "indexing" # filtered.seek(0) # tree = generator.fromFile(filtered, **opts) generator.process(tree, **opts) filtered.close() # fixup nested dd's and dt's produced by lxml for dd in tree.findall("//dd/dd"): if list(dd) or dd.text.strip(): dd.getparent().addnext(dd) else: dd.getparent().remove(dd) for dt in tree.findall("//dt/dt"): if list(dt) or dt.text.strip(): dt.getparent().addnext(dt) else: dt.getparent().remove(dt) # remove unused references print "processing references" for dt in tree.findall("//dt[@id]"): refID = dt.get("id") if refID.startswith("refs") and len(tree.findall("//a[@href='#%s']" % refID)) == 0: next = dt.getnext() while next.tag != "dd": next = next.getnext() dt.getparent().remove(next) dt.getparent().remove(dt) elif refID.startswith("refs"): dd = dt.getnext() while dd.tag != "dd": dd = dd.getnext() links = dd.findall(".//a[@href]") for link in links: if link is not None: wrap = link.getparent() link.tail = " (URL: " idx = wrap.index(link) url = etree.Element("a", href=link.get("href")) url.text = link.get("href") wrap.insert(idx + 1, url) url.tail = ")" if spec == "microdata": print "munging (after anolis)" # get the h3 for the misplaced section (it has no container) section = tree.xpath("//h3[@id = 'htmlpropertiescollection']")[0] # then get all of its following siblings that have the h2 for the next section as # a following sibling themselves. Yeah, XPath doesn't suck. section_content = section.xpath("following-sibling::*[following-sibling::h2[@id='introduction']]") target = tree.xpath("//h2[@id = 'converting-html-to-other-formats']")[0].getparent() target.addprevious(section) for el in section_content: target.addprevious(el) section.xpath("span")[0].text = "6.1 " # move the toc as well link = tree.xpath("//ol[@class='toc']//a[@href='#htmlpropertiescollection']")[0] link.xpath("span")[0].text = "6.1 " tree.xpath("//ol[@class='toc']/li[a[@href='#microdata-dom-api']]")[0].append(link.getparent().getparent()) if spec == "srcset": print "munging (after anolis)" # In the WHATWG spec, srcset="" is simply an aspect of # HTMLImageElement and not a separate feature. In order to keep # the HTML WG's srcset="" spec organized, we have to move some # things around in the final document. # Move "The srcset IDL attribute must reflect..." reflect_the_content_attribute = tree.findall("//div[@class='impl']")[0] target = tree.find("//div[@class='note']") target.addprevious(reflect_the_content_attribute) # Move "The IDL attribute complete must return true..." 
        note_about_complete = tree.findall("//p[@class='note']")[4]
        p_otherwise = note_about_complete.xpath("preceding-sibling::p[position()=1]")[0]
        ul_conditions = p_otherwise.xpath("preceding-sibling::ul[position()=1]")[0]
        p_start = ul_conditions.xpath("preceding-sibling::p[position()=1]")[0]
        target.addnext(note_about_complete)
        target.addnext(p_otherwise)
        target.addnext(ul_conditions)
        target.addnext(p_start)

    try:
        os.makedirs(spec_dir)
    except:
        pass

    if spec == "html":
        print "cleaning"
        from glob import glob
        for name in glob("%s/*.html" % spec_dir):
            os.remove(name)
        output = StringIO()
    else:
        output = open("%s/Overview.html" % spec_dir, "wb")

    generator.toFile(tree, output, **opts)

    if spec != "html":
        output.close()
    else:
        value = output.getvalue()
        if "<!--INTERFACES-->\n" in value:
            print "interfaces"
            from interface_index import interface_index
            output.seek(0)
            index = StringIO()
            interface_index(output, index)
            value = value.replace("<!--INTERFACES-->\n", index.getvalue(), 1)
            index.close()
        output = open("%s/single-page.html" % spec_dir, "wb")
        output.write(value)
        output.close()
        value = ""

        print "splitting"
        import spec_splitter
        spec_splitter.w3c = True
        spec_splitter.no_split_exceptions = conf.get("no_split_exceptions", False)
        spec_splitter.minimal_split_exceptions = conf.get("minimal_split_exceptions", False)
        spec_splitter.main("%s/single-page.html" % spec_dir, spec_dir)

        print "entities"
        entities = open(os.path.join(cur_dir, "boilerplate/entities.inc"))
        json = open("%s/entities.json" % spec_dir, "w")
        from entity_processor_json import entity_processor_json
        entity_processor_json(entities, json)
        entities.close()
        json.close()

    # copying dependencies
    def copy_dependencies(targets):
        import types
        if not isinstance(targets, types.ListType):
            targets = [targets]
        if os.name == "nt":
            for target in targets:
                os.system("xcopy /s %s %s" % (os.path.join(conf["path"], target), spec_dir))
        else:
            for target in targets:
                os.system("/bin/csh -i -c '/bin/cp -R %s %s'" % (os.path.join(conf["path"], target), spec_dir))

    print "copying"
    if spec == "html":
        if os.name == "nt":
            dirs = ["images", "fonts", "404", "switcher", "js"]
        else:
            dirs = ["images", "fonts", "404/*", "switcher", "js"]
        copy_dependencies(dirs)
    elif spec == "2dcontext":
        copy_dependencies(["images", "fonts"])
    else:
        copy_dependencies("fonts")

    # fix the styling of the 404
    if spec == "html":
        link = tree.xpath("//link[starts-with(@href, 'http://www.w3.org/StyleSheets/TR/')]")[0].get("href")
        path = os.path.join(spec_dir, "404.html")
        with open(path) as data:
            html404 = data.read()
        html404 = html404.replace("http://www.w3.org/StyleSheets/TR/W3C-ED", link)
        with open(path, "w") as data:
            data.write(html404)
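# How main(spec, spec_dir, branch) is invoked is not shown above. A minimal
# command-line wrapper under the same assumptions (spec name required, output
# directory and branch optional) could look like this sketch; the argument
# order and the invoked_incorrectly() fallback are assumptions, not the
# scripts' actual entry point.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        invoked_incorrectly()
    spec = sys.argv[1]
    spec_dir = sys.argv[2] if len(sys.argv) > 2 else None
    branch = sys.argv[3] if len(sys.argv) > 3 else "master"
    main(spec, spec_dir, branch)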