Example #1
# Imports assumed by this snippet (render_to_string is Django's template helper):
import time

from django.template.loader import render_to_string
from html5lib import parseFragment
from html5lib.serializer import serialize


def transform_collapsibles(text):
    """Find simple collapsible elements and transform them to full html."""
    tree = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    base_id = ''.join(filter(str.isdigit, str(time.time())))
    collapsibles = tree.findall('./div[@class="collapsible-item"]')
    for i, collapsible in enumerate(collapsibles):
        title = collapsible.find('./div[@class="collapsible-item-title"]')
        body = collapsible.find('./div[@class="collapsible-item-body"]')

        if title is not None and body is not None:
            title.tag = 'span'
            del title.attrib['class']

            body.tag = 'div'
            del body.attrib['class']

            final_html = render_to_string(
                'a4ckeditor/collapsible_fragment.html',
                dict(
                    id='a4ckeditor-collapsible-{}_{}'.format(base_id, i),
                    title=serialize(title),
                    body=serialize(body))
            )

            collapsible.clear()
            collapsible.append(parseFragment(final_html, treebuilder='etree',
                                             namespaceHTMLElements=False))

    return serialize(tree)
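
A minimal way to exercise the helper above; the sample markup is illustrative and assumes Django's template machinery is configured so render_to_string can find 'a4ckeditor/collapsible_fragment.html':

snippet = (
    '<div class="collapsible-item">'
    '<div class="collapsible-item-title">Read more</div>'
    '<div class="collapsible-item-body"><p>Details go here.</p></div>'
    '</div>'
)
# Returns the fragment with each collapsible re-rendered through the template.
print(transform_collapsibles(snippet))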
 def testEntityXML(self):
     doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serialize(tree, tree="lxml", omit_optional_tags=False)
     self.assertEqual(
         """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""",
         result)
Example #3
 def run(self, text):
     parser = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer)
     parsed = parser.parse(text)
     serializer = html5lib.serializer.HTMLSerializer()
     walker = html5lib.getTreeWalker("etree")
     stream = html5lib.filters.alphabeticalattributes.Filter(walker(parsed))
     out = ''.join(serializer.serialize(stream))
     return out
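
The same walk/filter/serialize chain also works with the stock parser; a self-contained sketch (ForgeHTMLSanitizer is project-specific and omitted here, and the roundtrip name is arbitrary):

import html5lib
from html5lib.filters.alphabeticalattributes import Filter as SortAttrs
from html5lib.serializer import HTMLSerializer

def roundtrip(text):
    # Parse with the default etree tree builder, then stream the tree
    # through the attribute-sorting filter into the serializer.
    parsed = html5lib.parse(text)
    walker = html5lib.getTreeWalker("etree")
    stream = SortAttrs(walker(parsed))
    return ''.join(HTMLSerializer().serialize(stream))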
Example #4
def testEntityNoResolve(lxml_parser):
    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
    result = serialize(tree,
                       tree="lxml",
                       omit_optional_tags=False,
                       resolve_entities=False)
    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
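
The lxml-based tests above assume a parser that leaves entity references unexpanded; a plausible pytest fixture for lxml_parser (an assumption, not shown in the example) is:

import pytest
from lxml import etree

@pytest.fixture
def lxml_parser():
    # Keep &beta; as an entity reference so resolve_entities=False
    # in the serializer has something to preserve.
    return etree.XMLParser(resolve_entities=False)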
 def testEntityReplacement(self):
     doc = u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serializer.serialize(tree,
                                   tree=u"lxml",
                                   omit_optional_tags=False)
     self.assertEqual(
         u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""",
         result)
Example #7
    def run(self):
        """
		Run the two steps of validation, and return the serialized version of the DOM Tree, ready to be displayed
		"""
        self.parse()
        self.complete_DOM()
        if PY3:
            from html5lib.serializer import serialize
            return serialize(self.domtree, tree='dom')
        else:
            return str(self.domtree.toxml(encoding="utf-8"))
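
Outside that class, the same DOM-tree serialization can be reproduced in a few lines; a minimal sketch:

import html5lib
from html5lib.serializer import serialize

# Build a minidom tree and serialize it via the 'dom' tree walker,
# mirroring the serialize(self.domtree, tree='dom') call above.
domtree = html5lib.parse("<p>Hello, <em>world</em>!", treebuilder="dom")
print(serialize(domtree, tree="dom"))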
Example #8
 def testEntityNoResolve(self):
     doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serialize(tree, tree='lxml', omit_optional_tags=False,
                        resolve_entities=False)
     self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
Example #9
# Module-level imports used by this snippet (flags such as verbose, make_index_of_terms
# and use_html5lib_serialiser are defined elsewhere in the script):
import os
import re
from copy import deepcopy

from lxml import etree


def main(input, output):
  global no_split_exceptions
  if use_html5lib_parser or use_html5lib_serialiser:
      import html5lib
      import html5lib.serializer
      import html5lib.treewalkers

  index_page = 'Overview'

  # The document is split on all <h2> elements, plus the following specific elements
  # (which were chosen to split any pages that were larger than about 100-200KB, and
  # may need to be adjusted as the spec changes):
  split_exceptions = [
      'the-a-element', 'the-abbr-element', 'the-address-element',
      'the-area-element', 'the-article-element', 'the-aside-element',
      'the-audio-element', 'the-b-element', 'the-base-element',
      'the-bdi-element', 'the-bdo-element', 'the-blockquote-element',
      'the-body-element', 'the-br-element', 'the-button-element',
      'the-canvas-element', 'the-caption-element', 'the-cite-element',
      'the-code-element', 'the-col-element', 'the-colgroup-element',
      'the-command-element', 'the-datalist-element', 'the-dd-element',
      'the-del-element', 'the-details-element', 'the-dfn-element',
      'the-dir-element', 'the-div-element', 'the-dl-element',
      'the-dt-element', 'the-em-element', 'the-embed-element',
      'the-fieldset-element', 'the-figcaption-element', 'the-figure-element',
      'the-footer-element', 'the-form-element',
      'the-h1-h2-h3-h4-h5-and-h6-elements', 'the-head-element',
      'the-header-element', 'the-hgroup-element', 'the-hr-element',
      'the-html-element', 'the-i-element', 'the-iframe-element',
      'the-img-element', 'the-input-element', 'the-ins-element',
      'the-kbd-element', 'the-keygen-element', 'the-label-element',
      'the-legend-element', 'the-li-element', 'the-link-element',
      'the-map-element', 'the-mark-element', 'the-menu-element',
      'the-meta-element', 'the-meter-element', 'the-nav-element',
      'the-noscript-element', 'the-object-element', 'the-ol-element',
      'the-optgroup-element', 'the-option-element', 'the-output-element',
      'the-p-element', 'the-param-element', 'the-pre-element',
      'the-progress-element', 'the-q-element', 'the-rp-element',
      'the-rt-element', 'the-ruby-element', 'the-s-element',
      'the-samp-element', 'the-script-element', 'the-section-element',
      'the-select-element', 'the-small-element', 'the-source-element',
      'the-span-element', 'the-strong-element', 'the-style-element',
      'the-sub-and-sup-elements', 'the-summary-element', 'the-table-element',
      'the-tbody-element', 'the-td-element', 'the-textarea-element',
      'the-tfoot-element', 'the-th-element', 'the-thead-element',
      'the-time-element', 'the-title-element', 'the-tr-element',
      'the-track-element', 'the-u-element', 'the-ul-element',
      'the-var-element', 'the-video-element', 'the-wbr-element',

      'styling',
      'usage-summary',
      'attributes-common-to-ins-and-del-elements',
      'edits-and-paragraphs',
      'edits-and-lists',
      'media-elements',
      'image-maps',
      'mathml',
      'svg-0',
      'dimension-attributes',
      'attributes-common-to-td-and-th-elements',
      'examples',
      'common-input-element-apis',

      'global-attributes',
      'element-definitions',
      'common-dom-interfaces',
      'namespaces',
      'requirements-relating-to-bidirectional-algorithm-formatting-characters',
      'wai-aria',
      'interactions-with-xpath-and-xslt',
      'headings-and-sections',

      'dynamic-markup-insertion',
      'common-microsyntaxes', 'urls', # <-- infrastructure
      'elements', 'content-models', 'apis-in-html-documents', # <-- dom

      'attributes-common-to-form-controls',
      'textFieldSelection',
      'constraints',
      'form-submission',

      'common-idioms-without-dedicated-elements',

      'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
      'embedded-content-1', 'tabular-data',
      'forms', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
      'interactive-elements', 'commands', # <-- semantics

      'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
      'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
      'user-prompts',
      'system-state-and-capabilities',
      'dnd', # <-- editing
      'editing-apis',

      'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
  ]
  if no_split_exceptions or minimal_split_exceptions: split_exceptions = []


  if verbose: print "Parsing..."

  # Parse document
  if use_html5lib_parser:
      parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
      doc = parser.parse(open(input), encoding='utf-8')
  else:
      parser = etree.HTMLParser(encoding='utf-8')
      doc = etree.parse(open(input), parser)

  if verbose: print "Splitting..."

  doctitle = doc.find('.//title').text

  if make_index_of_terms:
    # get all the nodes from the index of terms (if any) and save for later
    index_of_terms = doc.xpath("//*[@class='index-of-terms']//dl")

  # Extract the body from the source document
  original_body = doc.find('body')

  # Create an empty body, for the page content to be added into later
  default_body = etree.Element('body')
  if original_body.get('class'): default_body.set('class', original_body.get('class'))
  default_body.set('onload', 'fixBrokenLink();')
  original_body.getparent().replace(original_body, default_body)

  # Extract the header, so we can reuse it in every page
  header = original_body.find('.//*[@class="head"]')

  # Make a stripped-down version of it
  short_header = deepcopy(header)
  del short_header[1:]

  # Extract the items in the TOC (remembering their nesting depth)
  def extract_toc_items(items, ol, depth):
      for li in ol.iterchildren():
          for c in li.iterchildren():
              if c.tag == 'a':
                if c.get('href')[0] == '#':
                  items.append( (depth, c.get('href')[1:], c) )
              elif c.tag == 'ol':
                  extract_toc_items(items, c, depth+1)
  toc_items = []
  extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)

  # Stuff for fixing up references:

  def get_page_filename(name):
      return '%s.html' % name

  # Finds all the ids and remembers which page they were on
  id_pages = {}
  def extract_ids(page, node):
      if node.get('id'):
          id_pages[node.get('id')] = page
      for e in node.findall('.//*[@id]'):
          id_pages[e.get('id')] = page

  # Updates all the href="#id" to point to page#id
  missing_warnings = set()
  def fix_refs(page, node):
      for e in node.findall('.//a[@href]'):
          if e.get('href')[0] == '#':
              id = e.get('href')[1:]
              if id in id_pages:
                  if id_pages[id] != page: # only do non-local links
                      e.set('href', '%s#%s' % (get_page_filename(id_pages[id]), id))
              else:
                  missing_warnings.add(id)

  def report_broken_refs():
      for id in sorted(missing_warnings):
          print "warning: can't find target for #%s" % id

  pages = [] # for saving all the output, so fix_refs can be called in a second pass

  # Iterator over the full spec's body contents
  child_iter = original_body.iterchildren()

  def add_class(e, cls):
      if e.get('class'):
          e.set('class', e.get('class') + ' ' + cls)
      else:
          e.set('class', cls)

  # Contents/intro page:

  page = deepcopy(doc)
  page_body = page.find('body')
  add_class(page_body, 'split index')

  # Keep copying stuff from the front of the source document into this
  # page, until we find the first heading that isn't class="no-toc"
  for e in child_iter:
      if e.getnext().tag == 'h2' and 'no-toc' not in (e.getnext().get('class') or '').split(' '):
          break
      page_body.append(e)

  pages.append( (index_page, page, 'Front cover') )

  # Section/subsection pages:

  def should_split(e):
      global in_semantics, in_semantics_seen_first
      
      if e.get("id") == "semantics":
          in_semantics = True
          return True
      if e.tag == 'h2':
          in_semantics = False
          return True
      if e.tag == "h3" and in_semantics and minimal_split_exceptions:
          if in_semantics_seen_first: return True
          in_semantics_seen_first = True
      if e.get('id') in split_exceptions: return True
      if e.tag == 'div' and e.get('class') == 'impl':
          c = e.getchildren()
          if len(c):
              if c[0].tag == 'h2': return True
              if c[0].tag == "h3" and in_semantics and minimal_split_exceptions: return True
              if c[0].get('id') in split_exceptions: return True
      return False

  def get_heading_text_and_id(e):
      if e.tag == 'div' and e.get('class') == 'impl':
          node = e.getchildren()[0]
      else:
          node = e
      title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
      return title, node.get('id')

  for heading in child_iter:
      # Handle the heading for this section
      title, name = get_heading_text_and_id(heading)
      if name == index_page: name = 'section-%s' % name
      if verbose: print '  <%s> %s - %s' % (heading.tag, name, title)

      page = deepcopy(doc)
      page_body = page.find('body')
      add_class(page_body, 'split chapter')

      page.find('//title').text = title + u' \u2014 ' + doctitle

      # Add the header
      page_body.append(deepcopy(short_header))

      # Add the page heading
      page_body.append(deepcopy(heading))
      extract_ids(name, heading)

      # Keep copying stuff from the source, until we reach the end of the
      # document or find a header to split on
      e = heading
      while e.getnext() is not None and not should_split(e.getnext()):
          e = child_iter.next()
          extract_ids(name, e)
          page_body.append(deepcopy(e))

      pages.append( (name, page, title) )

  # Fix the links, and add some navigation:

  for i in range(len(pages)):
      name, doc, title = pages[i]

      fix_refs(name, doc)

      if name == index_page: continue # don't add nav links to the TOC page

      head = doc.find('head')

      nav = etree.Element('nav')
      nav.set('class', 'prev_next')
      nav.text = '\n   '
      nav.tail = '\n\n  '

      if i > 1:
          href = get_page_filename(pages[i-1][0])
          title = pages[i-1][2]
          a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
          a.tail = u' \u2013\n   '
          nav.append(a)
          link = etree.XML('<link href="%s" title="%s" rel="prev"/>' % (href, title))
          link.tail = '\n  '
          head.append(link)

      a = etree.XML('<a href="%s.html#contents">Table of contents</a>' % index_page)
      a.tail = '\n  '
      nav.append(a)
      link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="contents"/>' % index_page)
      link.tail = '\n  '
      head.append(link)

      if i != len(pages)-1:
          href = get_page_filename(pages[i+1][0])
          title = pages[i+1][2]
          a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
          a.tail = '\n  '
          nav.append(a)
          a.getprevious().tail = u' \u2013\n   '
          link = etree.XML('<link href="%s" title="%s" rel="next"/>' % (href, title))
          link.tail = '\n  '
          head.append(link)

      # Add a subset of the TOC to each page:

      # Find the items that are on this page
      new_toc_items = [ (d, id, e) for (d, id, e) in toc_items if id_pages[id] == name ]
      if len(new_toc_items) > 1: # don't bother if there's only one item, since it looks silly
          # Construct the new toc <ol>
          new_toc = etree.XML(u'<ol class="toc"/>')
          cur_ol = new_toc
          cur_li = None
          cur_depth = 0
          # Add each item, reconstructing the nested <ol>s and <li>s to preserve
          # the nesting depth of each item
          for (d, id, e) in new_toc_items:
              while d > cur_depth:
                  if cur_li is None:
                      cur_li = etree.XML(u'<li/>')
                      cur_ol.append(cur_li)
                  cur_ol = etree.XML('<ol/>')
                  cur_li.append(cur_ol)
                  cur_li = None
                  cur_depth += 1
              while d < cur_depth:
                  cur_li = cur_ol.getparent()
                  cur_ol = cur_li.getparent()
                  cur_depth -= 1
              cur_li = etree.XML(u'<li/>')
              cur_li.append(deepcopy(e))
              cur_ol.append(cur_li)
          nav.append(new_toc)

      doc.find('body').insert(1, nav) # after the header

  if make_index_of_terms:
    # Write additional separate files for each term entry in the index of terms.
    # Each term entry should be a <dl> with an id attribute whose value is an id of
    # a <dfn>, with the string "_index" appended to it.
    # For now, the subdirectory for the files is hardcoded here as "index-of-terms".
    os.makedirs(os.path.join(output, "index-of-terms"))
    for term in index_of_terms:
        # the firstChild <dt> here is a name and link for the defining instance of
        # each index term; we don't need that in this context, so just remove it
        term.remove(term.find("./dt"))
        fix_refs('DUMMY', term)
        # we use the ID of the term as the base for the filename, minus the last six
        # characters ("_index")
        id = term.get("id")[:-6]
        f = open(os.path.join(output, "index-of-terms", id + ".html"), 'w')
        f.write(etree.tostring(term, pretty_print=True, method="html"))

  report_broken_refs()

  if verbose: print "Outputting..."

  # Bug 12539 - lxml incorrectly munges some of our named character
  # references, so we manually fix them up here.
  ncrs_to_fix = {
    "//*[@id='entity-LeftAngleBracket']//span": u"⟨",
    "//*[@id='entity-RightAngleBracket']//span": u"⟩"
  }
  def fixup_ncrs(doc):
    for selector in ncrs_to_fix:
      doc.xpath(selector)[0].text = ncrs_to_fix[selector]

  # Output all the pages
  for name, doc, title in pages:
      f = open(os.path.join(output, get_page_filename(name)), 'w')
  #    f.write("<!doctype html>\n")
      if use_html5lib_serialiser:
          if name == 'named-character-references':
              fixup_ncrs(doc)
          tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
          serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
          for text in serializer.serialize(tokens, encoding='utf-8'):
            f.write(text)
      else:
          f.write(etree.tostring(doc, pretty_print=False, method="html"))

  # Generate the script to fix broken links
  f = open('%s/fragment-links.js' % (output), 'w')
  links = ','.join("'%s':'%s'" % (k.replace("\\", "\\\\").replace("'", "\\'"), v) for (k,v) in id_pages.items())
  f.write('var fragment_links = { ' + re.sub(r"([^\x20-\x7f])", lambda m: "\\u%04x" % ord(m.group(1)), links) + ' };\n')
  f.write("""
  var fragid = window.location.hash.substr(1);
  if (!fragid) { /* handle section-foo.html links from the old multipage version, and broken foo.html from the new version */
      var m = window.location.pathname.match(/\/(?:section-)?([\w\-]+)\.html/);
      if (m) fragid = m[1];
  }
  var page = fragment_links[fragid];
  if (page) {
      window.location.replace(page+'.html#'+fragid);
  }
  """)

  if verbose: print "Done."
Example #10
    def generate_stage(self, i, stage):
        if i == 0:
            return tournament.create_group_stage_section(stage)
        if i in (1, 3):
            svg = playoffs_svg.create_gauntlet_svg(stage['matches'])
            image = tournament.svg_to_image(svg, alt="")
            return tournament.create_knockout_stage_section(stage, image=image)
        if i == 2:
            return tournament.create_bo2_group_stage_section(stage)
        if i == 4:
            return tournament.create_championship_points_section(stage)
        if i == 5:
            svg = playoffs_svg.create_single_elimination_svg(stage['matches'])
            image = tournament.svg_to_image(svg, alt="")
            return tournament.create_knockout_stage_section(stage, image=image)


if __name__ == '__main__':
    from html5lib.serializer import serialize
    import sys
    import yaml

    data = yaml.safe_load(sys.stdin)
    html = LMS2016Generator().generate(data)

    out = sys.stdout.buffer
    out.write(b"<!DOCTYPE html>\n")
    out.write(serialize(html, encoding='ascii', inject_meta_charset=False))
    out.write(b"\n")
        "code, pre { background: #f4f4f4; } "
        "pre, h2 { margin: 0; } "
        "ul { margin: 0 0 0 16px; padding: 8px 0; } "
        "ol { margin: 0 0 0 28px; padding: 8px 0; } "
        "li { margin: 0 0 8px; } ")
    style_node = document.createElement('style')
    style_node.setAttribute('type', 'text/css')
    style_node.appendChild(document.createTextNode(css))
    head_node.appendChild(style_node)
    new_page.appendChild(head_node)
    body_node = document.createElement('body')
    # This step processes Quora's HTML into a more lightweight and portable form.
    cleanup_tree(document, answer_node, body_node)
    new_page.appendChild(body_node)
    # Okay! Finally, save the HTML.
    walker = treewalkers.getTreeWalker('dom')(new_page)
    try:
        with open(args.output_dir + '/' + filename, 'wb') as saved_page:
            saved_page.write(b'<!DOCTYPE html>')
            saved_page.write(
                serializer.serialize(new_page,
                                     'dom',
                                     'utf-8',
                                     omit_optional_tags=False))
    except IOError as error:
        print('[ERROR] Failed to save to file %s (%s)' %
              (filename, error.strerror),
              file=sys.stderr)

print('Done', file=sys.stderr)
 def testEntityNoResolve(self):
     doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
                                   resolve_entities=False)
     self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
Example #13
report_broken_refs()

print "Outputting..."

# Output all the pages
for name, doc, title in pages:
    f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
    if w3c:
        f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
    else:
        pass # f.write('<!DOCTYPE html>\n')
    if use_html5lib_serialiser:
        tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='us-ascii'):
            if text != '<!DOCTYPE html>': # some versions of lxml emit this; get rid of it if so
                f.write(text)
    else:
        f.write(etree.tostring(doc, pretty_print=False, method="html"))

# Generate the script to fix broken links
#f = open('%s/fragment-links.js' % (file_args[1]), 'w')
#links = ','.join('"%s":"%s"' % (k.replace("\\", "\\\\").replace('"', '\\"'), v) for (k,v) in id_pages.items())
#f.write('var fragment_links = { ' + re.sub(r"([^\x20-\x7f])", lambda m: "\\u%04x" % ord(m.group(1)), links) + ' };\n')
#f.write("""
#var fragid = window.location.hash.substr(1);
#if (!fragid) { /* handle section-foo.html links from the old multipage version, and broken foo.html from the new version */
#    var m = window.location.pathname.match(/\/(?:section-)?([\w\-]+)\.html/);
#    if (m) fragid = m[1];
#}
Example #14
# Module-level imports used by this snippet (flags such as verbose, w3c and
# use_html5lib_serialiser are defined elsewhere in the script):
import re
from copy import deepcopy

from lxml import etree


def main(input, output):
    if use_html5lib_parser or use_html5lib_serialiser:
        import html5lib
        import html5lib.serializer
        import html5lib.treewalkers

    if w3c:
        index_page = 'spec'
    else:
        index_page = 'index'

    # The document is split on all <h2> elements, plus the following specific elements
    # (which were chosen to split any pages that were larger than about 100-200KB, and
    # may need to be adjusted as the spec changes):
    split_exceptions = [
        'the-a-element',
        'the-abbr-element',
        'the-address-element',
        'the-area-element',
        'the-article-element',
        'the-aside-element',
        'the-audio-element',
        'the-b-element',
        'the-base-element',
        'the-bdi-element',
        'the-bdo-element',
        'the-blockquote-element',
        'the-body-element',
        'the-br-element',
        'the-button-element',
        'the-canvas-element',
        'the-caption-element',
        'the-cite-element',
        'the-code-element',
        'the-col-element',
        'the-colgroup-element',
        'the-command-element',
        'the-datalist-element',
        'the-dd-element',
        'the-del-element',
        'the-details-element',
        'the-dfn-element',
        'the-dir-element',
        'the-div-element',
        'the-dl-element',
        'the-dt-element',
        'the-em-element',
        'the-embed-element',
        'the-fieldset-element',
        'the-figcaption-element',
        'the-figure-element',
        'the-footer-element',
        'the-form-element',
        'the-h1-h2-h3-h4-h5-and-h6-elements',
        'the-head-element',
        'the-header-element',
        'the-hgroup-element',
        'the-hr-element',
        'the-html-element',
        'the-i-element',
        'the-iframe-element',
        'the-img-element',
        'the-input-element',
        'the-ins-element',
        'the-kbd-element',
        'the-keygen-element',
        'the-label-element',
        'the-legend-element',
        'the-li-element',
        'the-link-element',
        'the-map-element',
        'the-mark-element',
        'the-menu-element',
        'the-meta-element',
        'the-meter-element',
        'the-nav-element',
        'the-noscript-element',
        'the-object-element',
        'the-ol-element',
        'the-optgroup-element',
        'the-option-element',
        'the-output-element',
        'the-p-element',
        'the-param-element',
        'the-pre-element',
        'the-progress-element',
        'the-q-element',
        'the-rp-element',
        'the-rt-element',
        'the-ruby-element',
        'the-s-element',
        'the-samp-element',
        'the-script-element',
        'the-section-element',
        'the-select-element',
        'the-small-element',
        'the-source-element',
        'the-span-element',
        'the-strong-element',
        'the-style-element',
        'the-sub-and-sup-elements',
        'the-summary-element',
        'the-table-element',
        'the-tbody-element',
        'the-td-element',
        'the-textarea-element',
        'the-tfoot-element',
        'the-th-element',
        'the-thead-element',
        'the-time-element',
        'the-title-element',
        'the-tr-element',
        'the-track-element',
        'the-u-element',
        'the-ul-element',
        'the-var-element',
        'the-video-element',
        'the-wbr-element',
        'styling',
        'usage-summary',
        'attributes-common-to-ins-and-del-elements',
        'edits-and-paragraphs',
        'edits-and-lists',
        'media-elements',
        'image-maps',
        'mathml',
        'svg-0',
        'dimension-attributes',
        'attributes-common-to-td-and-th-elements',
        'examples',
        'common-input-element-apis',
        'global-attributes',
        'element-definitions',
        'common-dom-interfaces',
        'namespaces',
        'requirements-relating-to-bidirectional-algorithm-formatting-characters',
        'wai-aria',
        'interactions-with-xpath-and-xslt',
        'headings-and-sections',
        'dynamic-markup-insertion',
        'common-microsyntaxes',
        'urls',  # <-- infrastructure
        'elements',
        'content-models',
        'apis-in-html-documents',  # <-- dom
        'attributes-common-to-form-controls',
        'textFieldSelection',
        'constraints',
        'form-submission',
        'common-idioms-without-dedicated-elements',
        'scripting-1',
        'sections',
        'grouping-content',
        'text-level-semantics',
        'edits',
        'embedded-content-1',
        'tabular-data',
        'forms',
        'states-of-the-type-attribute',
        'number-state',
        'common-input-element-attributes',
        'the-button-element',
        'association-of-controls-and-forms',
        'interactive-elements',
        'commands',  # <-- semantics
        'predefined-vocabularies-0',
        'converting-html-to-other-formats',  # <-- microdata
        'origin-0',
        'timers',
        'offline',
        'history',
        'links',  # <-- browsers
        'user-prompts',
        'system-state-and-capabilities',
        'dnd',  # <-- editing
        'editing-apis',
        'parsing',
        'tokenization',
        'tree-construction',
        'the-end',
        'named-character-references',  # <-- syntax
    ]

    if verbose: print "Parsing..."

    # Parse document
    if use_html5lib_parser:
        parser = html5lib.html5parser.HTMLParser(
            tree=html5lib.treebuilders.getTreeBuilder('lxml'))
        doc = parser.parse(open(input), encoding='utf-8')
    else:
        parser = etree.HTMLParser(encoding='utf-8')
        doc = etree.parse(open(input), parser)

    if verbose: print "Splitting..."

    doctitle = doc.find('.//title').text

    if make_index_of_terms:
        # get all the nodes from the index of terms (if any) and save for later
        index_of_terms = doc.xpath("//*[@class='index-of-terms']//dl")

    # Absolutise some references, so the spec can be hosted elsewhere
    if absolute_uris:
        for a in ('href', 'src'):
            for t in ('link', 'script', 'img'):
                for e in doc.findall('//%s[@%s]' % (t, a)):
                    if e.get(a)[0] == '/':
                        e.set(a, 'http://www.whatwg.org' + e.get(a))
                    else:
                        e.set(
                            a,
                            'http://www.whatwg.org/specs/web-apps/current-work/'
                            + e.get(a))

    # Extract the body from the source document
    original_body = doc.find('body')

    # Create an empty body, for the page content to be added into later
    default_body = etree.Element('body')
    if original_body.get('class'):
        default_body.set('class', original_body.get('class'))
    default_body.set('onload', 'fixBrokenLink();')
    #if original_body.get('onload'): default_body.set('onload', 'fixBrokenLink(); %s' % original_body.get('onload'))
    original_body.getparent().replace(original_body, default_body)

    # Extract the header, so we can reuse it in every page
    header = original_body.find('.//*[@class="head"]')

    # Make a stripped-down version of it
    short_header = deepcopy(header)
    del short_header[4:]

    # Extract the items in the TOC (remembering their nesting depth)
    def extract_toc_items(items, ol, depth):
        for li in ol.iterchildren():
            for c in li.iterchildren():
                if c.tag == 'a':
                    if c.get('href')[0] == '#':
                        items.append((depth, c.get('href')[1:], c))
                elif c.tag == 'ol':
                    extract_toc_items(items, c, depth + 1)

    toc_items = []
    extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)

    # Prepare the link-fixup script
    if not w3c:
        link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
        doc.find('head')[-1].tail = '\n  '
        doc.find('head').append(link_fixup_script)
        link_fixup_script.tail = '\n  '

    # Stuff for fixing up references:

    def get_page_filename(name):
        return '%s.html' % name

    # Finds all the ids and remembers which page they were on
    id_pages = {}

    def extract_ids(page, node):
        if node.get('id'):
            id_pages[node.get('id')] = page
        for e in node.findall('.//*[@id]'):
            id_pages[e.get('id')] = page

    # Updates all the href="#id" to point to page#id
    missing_warnings = set()

    def fix_refs(page, node):
        for e in node.findall('.//a[@href]'):
            if e.get('href')[0] == '#':
                id = e.get('href')[1:]
                if id in id_pages:
                    if id_pages[id] != page:  # only do non-local links
                        e.set('href',
                              '%s#%s' % (get_page_filename(id_pages[id]), id))
                else:
                    missing_warnings.add(id)

    def report_broken_refs():
        for id in sorted(missing_warnings):
            print "warning: can't find target for #%s" % id

    pages = []  # for saving all the output, so fix_refs can be called in a second pass

    # Iterator over the full spec's body contents
    child_iter = original_body.iterchildren()

    def add_class(e, cls):
        if e.get('class'):
            e.set('class', e.get('class') + ' ' + cls)
        else:
            e.set('class', cls)

    # Contents/intro page:

    page = deepcopy(doc)
    page_body = page.find('body')
    add_class(page_body, 'split index')

    # Keep copying stuff from the front of the source document into this
    # page, until we find the first heading that isn't class="no-toc"
    for e in child_iter:
        if e.getnext().tag == 'h2' and 'no-toc' not in (
                e.getnext().get('class') or '').split(' '):
            break
        page_body.append(e)

    pages.append((index_page, page, 'Front cover'))

    # Section/subsection pages:

    def should_split(e):
        if e.tag == 'h2': return True
        if e.get('id') in split_exceptions: return True
        if e.tag == 'div' and e.get('class') == 'impl':
            c = e.getchildren()
            if len(c):
                if c[0].tag == 'h2': return True
                if c[0].get('id') in split_exceptions: return True
        return False

    def get_heading_text_and_id(e):
        if e.tag == 'div' and e.get('class') == 'impl':
            node = e.getchildren()[0]
        else:
            node = e
        title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
        return title, node.get('id')

    for heading in child_iter:
        # Handle the heading for this section
        title, name = get_heading_text_and_id(heading)
        if name == index_page: name = 'section-%s' % name
        if verbose: print '  <%s> %s - %s' % (heading.tag, name, title)

        page = deepcopy(doc)
        page_body = page.find('body')
        add_class(page_body, 'split chapter')

        page.find('//title').text = title + u' \u2014 ' + doctitle

        # Add the header
        page_body.append(deepcopy(short_header))

        # Add the page heading
        page_body.append(deepcopy(heading))
        extract_ids(name, heading)

        # Keep copying stuff from the source, until we reach the end of the
        # document or find a header to split on
        e = heading
        while e.getnext() is not None and not should_split(e.getnext()):
            e = child_iter.next()
            extract_ids(name, e)
            page_body.append(deepcopy(e))

        pages.append((name, page, title))

    # Fix the links, and add some navigation:

    for i in range(len(pages)):
        name, doc, title = pages[i]

        fix_refs(name, doc)

        if name == index_page: continue  # don't add nav links to the TOC page

        head = doc.find('head')

        if w3c:
            nav = etree.Element('div')  # HTML 4 compatibility
        else:
            nav = etree.Element('nav')
        nav.set('class', 'prev_next')
        nav.text = '\n   '
        nav.tail = '\n\n  '

        if i > 1:
            href = get_page_filename(pages[i - 1][0])
            title = pages[i - 1][2]
            a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
            a.tail = u' \u2013\n   '
            nav.append(a)
            link = etree.XML('<link href="%s" title="%s" rel="prev"/>' %
                             (href, title))
            link.tail = '\n  '
            head.append(link)

        a = etree.XML('<a href="%s.html#contents">Table of contents</a>' %
                      index_page)
        a.tail = '\n  '
        nav.append(a)
        link = etree.XML(
            '<link href="%s.html#contents" title="Table of contents" rel="contents"/>'
            % index_page)
        link.tail = '\n  '
        head.append(link)

        if i != len(pages) - 1:
            href = get_page_filename(pages[i + 1][0])
            title = pages[i + 1][2]
            a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
            a.tail = '\n  '
            nav.append(a)
            a.getprevious().tail = u' \u2013\n   '
            link = etree.XML('<link href="%s" title="%s" rel="next"/>' %
                             (href, title))
            link.tail = '\n  '
            head.append(link)

        # Add a subset of the TOC to each page:

        # Find the items that are on this page
        new_toc_items = [(d, id, e) for (d, id, e) in toc_items
                         if id_pages[id] == name]
        # don't bother if there's only one item, since it looks silly
        if len(new_toc_items) > 1:
            # Construct the new toc <ol>
            new_toc = etree.XML(u'<ol class="toc"/>')
            cur_ol = new_toc
            cur_li = None
            cur_depth = 0
            # Add each item, reconstructing the nested <ol>s and <li>s to preserve
            # the nesting depth of each item
            for (d, id, e) in new_toc_items:
                while d > cur_depth:
                    if cur_li is None:
                        cur_li = etree.XML(u'<li/>')
                        cur_ol.append(cur_li)
                    cur_ol = etree.XML('<ol/>')
                    cur_li.append(cur_ol)
                    cur_li = None
                    cur_depth += 1
                while d < cur_depth:
                    cur_li = cur_ol.getparent()
                    cur_ol = cur_li.getparent()
                    cur_depth -= 1
                cur_li = etree.XML(u'<li/>')
                cur_li.append(deepcopy(e))
                cur_ol.append(cur_li)
            nav.append(new_toc)

        doc.find('body').insert(1, nav)  # after the header

    if make_index_of_terms:
        # Write additional separate files for each term entry in the index of terms.
        # Each term entry should be a <dl> with an id attribute whose value is an id of
        # a <dfn>, with the string "_index" appended to it.
        # For now, the subdirectory for the files is hardcoded here as "index-of-terms".
        for term in index_of_terms:
            # the firstChild <dt> here is a name and link for the defining instance of
            # each index term; we don't need that in this context, so just remove it
            term.remove(term.find("./dt"))
            fix_refs('DUMMY', term)
            # we use the ID of the term as the base for the filename, minus the last six
            # characters ("_index")
            id = term.get("id")[:-6]
            f = open('%s/%s' % ("index-of-terms", id + ".html"), 'w')
            f.write(etree.tostring(term, pretty_print=True, method="html"))

    report_broken_refs()

    if verbose: print "Outputting..."

    # Output all the pages
    for name, doc, title in pages:
        f = open('%s/%s' % (output, get_page_filename(name)), 'w')
        #    f.write("<!doctype html>\n")
        if use_html5lib_serialiser:
            tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
            serializer = html5lib.serializer.HTMLSerializer(
                quote_attr_values=True, inject_meta_charset=False)
            for text in serializer.serialize(tokens, encoding='us-ascii'):
                f.write(text)
        else:
            f.write(etree.tostring(doc, pretty_print=False, method="html"))

    # Generate the script to fix broken links
    f = open('%s/fragment-links.js' % (output), 'w')
    links = ','.join("'%s':'%s'" %
                     (k.replace("\\", "\\\\").replace("'", "\\'"), v)
                     for (k, v) in id_pages.items())
    f.write('var fragment_links = { ' + re.sub(
        r"([^\x20-\x7f])", lambda m: "\\u%04x" % ord(m.group(1)), links) +
            ' };\n')
    f.write("""
  var fragid = window.location.hash.substr(1);
  if (!fragid) { /* handle section-foo.html links from the old multipage version, and broken foo.html from the new version */
      var m = window.location.pathname.match(/\/(?:section-)?([\w\-]+)\.html/);
      if (m) fragid = m[1];
  }
  var page = fragment_links[fragid];
  if (page) {
      window.location.replace(page+'.html#'+fragid);
  }
  """)

    if verbose: print "Done."
Example #15
def testEntityNoResolve(lxml_parser):
    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
    result = serialize(tree, tree="lxml", omit_optional_tags=False,
                                  resolve_entities=False)
    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
Example #16
 def testEntityXML(self):
     doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serialize(tree, tree='lxml', omit_optional_tags=False)
     self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
 def testEntityReplacement(self):
     doc = u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
     tree = etree.fromstring(doc, parser=self.parser).getroottree()
     result = serializer.serialize(tree, tree=u"lxml", omit_optional_tags=False)
     self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)