コード例 #1
0
 def save_doc(self, dselector,
              fname):  # really should refactor to use %20 etc escapes, etc
     """Assemble a full HTML page (doctype + notice + head + body) and
     write it to *fname*."""
     from html5lib import serialize
     head_html = serialize (self.d('head')[0], tree="lxml")
     body_html = serialize (self.d('body')[0], tree="lxml")
     page = u'<!DOCTYPE html>' + self._build_notice() + \
            head_html + '\n' + body_html + '\n</html>\n'
     self.write(page, fname)
コード例 #2
0
 def save_doc (self, dselector, fname):  # really should refactor to use %20 etc escapes, etc
   """Write the current document out to *fname* as a complete HTML page
   (doctype, IE shim, notice banner, serialized head and body)."""
   from html5lib import serialize
   head_html = serialize (self.d('head')[0], tree="lxml")
   body_html = serialize (self.d('body')[0], tree="lxml")
   notice = self.notice % (map_file, self.map_line, dt())
   s = u'<!DOCTYPE html>' + ie_shim + notice + head_html + '\n' + body_html + '\n</html>\n'
   self.write (s, fname)
コード例 #3
0
ファイル: engine.py プロジェクト: jowolf/tlg
 def save_doc (self, dselector, fname):  # really should refactor to use %20 etc escapes, etc
   """Serialize head and body and write the assembled page to *fname*,
   prefixed with the configured preamble and the generated notice."""
   from html5lib import serialize
   head_html = serialize (self.d('head')[0], tree="lxml")
   body_html = serialize (self.d('body')[0], tree="lxml")
   s = config.preamble + self._build_notice() + head_html + '\n' + body_html + '\n</html>\n'
   self.write (s, fname)
コード例 #4
0
ファイル: __init__.py プロジェクト: soasuk/toc
def table_of_contents(html, url='', anchor_type='stacked-number'):
    """Build a nested <ol> table of contents from the h1-h6 headings in *html*.

    Returns a 2-tuple of serialized HTML strings: the TOC list, and the input
    document re-serialized with an id and an anchor link injected into each
    heading.  ``anchor_type='following-marker'`` appends a '#' link after each
    heading; any other value prepends the numeric label link.
    """
    # index[d] is the running counter for heading level d+1 (h1..h6).
    index = [0, 0, 0, 0, 0, 0]
    depth = 0

    # The TOC is built as nested <ol> elements in its own mini-document.
    toc_doc = getDOMImplementation().createDocument(None, 'ol', None)
    toc = ol = toc_doc.documentElement

    doc = html5lib.parse(html, treebuilder='dom', namespaceHTMLElements=False)
    for header in traverse_headings(doc.documentElement):
        # Heading depth comes from the tag name: 'h3' -> 3.
        nextdepth = int(header.nodeName[1])

        if nextdepth > depth:
            # Going deeper: reset counters below the new level, then open one
            # new <ol> per level descended.
            for i in range(nextdepth, 6):
                index[i - 1] = 0

            for i in range(depth, nextdepth):
                next_ol = toc_doc.createElement('ol')
                ol.appendChild(next_ol)
                ol = next_ol
        elif nextdepth < depth:
            # Coming back up: pop one list per level ascended.
            for i in range(nextdepth, depth): ol = ol.parentNode

        depth = nextdepth

        index[depth - 1] += 1
        # Label like '2-1-3': one component per non-zero counter down to depth.
        label = '-'.join([str(index[d]) for d in range(0, depth) if index[d]])

        # TOC entry: <li><a href="{url}#header-{label}">heading text</a></li>
        # NOTE(review): text nodes are created via `doc` but appended into
        # `toc_doc` elements — minidom tolerates the cross-document mix here.
        li = toc_doc.createElement('li')
        a = toc_doc.createElement('a')
        a.setAttribute('href', '%s#header-%s' % (url, label))
        a.appendChild(doc.createTextNode(innerText(header)))
        li.appendChild(a)
        ol.appendChild(li)

        header.setAttribute('id', 'header-' + label)

        if anchor_type == 'following-marker':
            # '#' link appended after the heading text.
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode('#'))
            header.appendChild(anchor)
        else:
            # Numeric-label link inserted before the heading text.
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode(label))
            header.insertBefore(anchor, header.firstChild)

    # Strip empty wrapper <ol>s so the returned list starts at the first
    # level that actually has <li> entries.
    ol = toc
    while not list(filter(lambda node: node.nodeName == 'li', ol.childNodes)) and list(filter(lambda node: node.nodeName == 'ol', ol.childNodes)):
        ol = list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))[0]
    ol.setAttribute('class', 'toc')

    return html5lib.serialize(ol, 'dom'), html5lib.serialize(doc, 'dom')
コード例 #5
0
ファイル: __init__.py プロジェクト: youngrok/toc
def table_of_contents(html, url='', anchor_type='stacked-number'):
    """Build a nested <ol> table of contents from the h1-h6 headings in *html*.

    Returns ``(toc_html, doc_html)``: the serialized TOC list, and the input
    document re-serialized with an id and an anchor link added to each
    heading.  ``anchor_type='following-marker'`` appends a '#' link after
    each heading; any other value prepends the numeric label link.
    """
    # index[d] is the running counter for heading level d+1 (h1..h6).
    index = [0, 0, 0, 0, 0, 0]
    depth = 0

    # The TOC is assembled as nested <ol> elements in a separate document.
    toc_doc = getDOMImplementation().createDocument(None, 'ol', None)
    toc = ol = toc_doc.documentElement

    doc = html5lib.parse(html, treebuilder='dom', namespaceHTMLElements=False)
    for header in traverse_headings(doc.documentElement):
        # Heading depth from the tag name: 'h2' -> 2.
        nextdepth = int(header.nodeName[1])

        if nextdepth > depth:
            # Deeper level: zero the counters below it, then open one new
            # <ol> per level descended.
            for i in range(nextdepth, 6):
                index[i - 1] = 0

            for i in range(depth, nextdepth):
                next_ol = toc_doc.createElement('ol')
                ol.appendChild(next_ol)
                ol = next_ol
        elif nextdepth < depth:
            # Shallower level: pop one list per level ascended.
            for i in range(nextdepth, depth): ol = ol.parentNode

        depth = nextdepth

        index[depth - 1] += 1
        # Label like '1-2': non-zero counters joined down to current depth.
        label = '-'.join([str(index[d]) for d in range(0, depth) if index[d]])

        # TOC entry linking to the heading's injected id.
        li = toc_doc.createElement('li')
        a = toc_doc.createElement('a')
        a.setAttribute('href', '%s#header-%s' % (url, label))
        a.appendChild(doc.createTextNode(innerText(header)))
        li.appendChild(a)
        ol.appendChild(li)

        header.setAttribute('id', 'header-' + label)

        if anchor_type == 'following-marker':
            # '#' marker link appended after the heading text.
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode('#'))
            header.appendChild(anchor)
        else:
            # Numeric label link inserted before the heading text.
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode(label))
            header.insertBefore(anchor, header.firstChild)

    # Drop empty wrapper <ol>s so the returned list starts at the first
    # level that actually holds entries.
    ol = toc
    while not list(filter(lambda node: node.nodeName == 'li', ol.childNodes)) and list(filter(lambda node: node.nodeName == 'ol', ol.childNodes)):
        ol = list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))[0]
    ol.setAttribute('class', 'toc')

    # NOTE(review): quote_attr_values=True is the legacy boolean form of this
    # option (newer html5lib expects the string "always") — confirm the
    # pinned html5lib version accepts it.
    return html5lib.serialize(ol, 'dom', quote_attr_values=True), html5lib.serialize(doc, 'dom', quote_attr_values=True)
コード例 #6
0
ファイル: sampuru.py プロジェクト: aleray/aa.sampuru
    def write(self, outfile):
        """Serialize the tree (optionally filtered by ``self.xpath``) and
        write the result to *outfile*.

        When ``self.xpath`` is set, only the matching elements are
        serialized, joined by newlines; otherwise the whole tree is written.
        """
        output = ""
        if self.xpath:
            matches = self.tree.xpath(self.xpath)
            if matches:
                # NOTE(review): serialize(..., encoding=...) returns bytes;
                # joining with a str separator assumes Python 2 semantics —
                # confirm before running under Python 3.
                output = "\n".join(html5lib.serialize(elt, tree="lxml", encoding="utf-8") for elt in matches)
        else:
            output = html5lib.serialize(self.tree, tree="lxml", encoding="utf-8")

        # Context manager so the handle is closed even on error; the original
        # open(outfile, 'wb').write(output) leaked the file object.
        with open(outfile, 'wb') as fh:
            fh.write(output)
コード例 #7
0
ファイル: bench_html.py プロジェクト: zachlewis2020/wpt
def bench_serialize(loops, fh, treebuilder):
    """Benchmark html5lib.serialize: parse *fh* once, then serialize the
    document *loops* times and return the elapsed time in seconds."""
    fh.seek(0)
    doc = html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)

    t0 = pyperf.perf_counter()

    # '_' instead of re-binding the `loops` parameter inside the loop.
    for _ in range(loops):
        html5lib.serialize(doc,
                           tree=treebuilder,
                           encoding="ascii",
                           inject_meta_charset=False)

    return pyperf.perf_counter() - t0
コード例 #8
0
ファイル: views.py プロジェクト: MikiasEphrem/cls
def parse_html(content):
    """Rewrite <script> tags in *content* and inject the hashi bootstrap
    script into <head>; return *content* unchanged if it cannot be parsed."""
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)
        if not document:
            # Could not parse
            return content

        for parent in document.findall(".//script/.."):
            for script in parent.findall("script"):
                replace_script(parent, script)
        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")
        SubElement(
            head,
            "script",
            attrib={
                # Fix: the template string contained no placeholder, so the
                # filename keyword was silently ignored and the literal text
                # was emitted as the src URL.
                "src": static(
                    "content/{filename}".format(filename=get_hashi_filename())
                )
            },
        )
        return html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )
    except html5lib.html5parser.ParseError:
        return content
コード例 #9
0
 def parse(self, response):
     """Normalize the page markup, emit a UkrNetPage item with its links,
     images and text, then follow up to 20 ukr.net links."""
     correct_html = html5lib.serialize(html5lib.parse(response.body))
     selector = Selector(text=correct_html)
     page = UkrNetPage()
     links = selector.xpath("//a/@href").extract()
     images = selector.xpath("//img/@src").extract()
     corrected_images = []
     for img in images:
         # Strip a protocol-relative '//' prefix when present.
         corrected_images.append(img.split('//')[1] if img.startswith('//') else img)
     text = filter(self.is_str_empty, map(lambda s: s.strip(),
                                          [text.extract() for text in
                                           response.xpath("//*[not(self::script)]/text()")]))
     page['url'] = response.request.url
     page['fragment_img'] = corrected_images
     page['fragment_text'] = text
     ukr_net_url = "www.ukr.net"
     wrong_start = "www."
     new_links_list = []
     for link in links:
         if ukr_net_url not in link:
             continue
         # Normalize protocol-relative links, then force https on bare www urls.
         correct_url = link.split('//')[1] if link.startswith("//") else link
         if correct_url.startswith(wrong_start):
             correct_url = "https://" + correct_url
         new_links_list.append(correct_url)
     yield page
     for link in new_links_list[:20]:
         yield response.follow(link, callback=self.parse)
コード例 #10
0
ファイル: normalize.py プロジェクト: fazalmajid/temboz
def balance(html, limit_words=None, ellipsis=' ...'):
  """Truncate *html* to at most *limit_words* words while keeping tags
  balanced; with no limit the markup is just round-tripped through html5lib.

  Tokens alternate between text and tags (tag_re.split); open tags are
  tracked on a stack, and any still open at the end are closed.
  """
  if not limit_words:
    return html5lib.serialize(html5lib.parse(html))
  word_count = 0
  tokens = tag_re.split(html)
  out = []
  stack = []  # currently-open (non-void) elements, innermost last
  for token in tokens:
    if not token.startswith('<'):
      # Text token: count words; crop once past the limit.
      if limit_words and word_count > limit_words:
        break
      words = token.split()
      word_count += len(words)
      if limit_words and word_count > limit_words:
        # crop is negative: slice off the words beyond the limit.
        crop = limit_words - word_count
        out.append(' '.join(words[:crop]) + ellipsis)
      else:
        out.append(token)
      continue
    if token.startswith('<!'): continue  # comments / doctype dropped
    if token == ']]>': continue
    if not token.endswith('>'): continue # invalid
    element = token[1:-1].split()[0].lower()
    if not element: continue # invalid
    if element in banned:
      # Banned elements are neutralized by rewriting them to <pre>.
      element = 'pre'
      token = '<pre>'

    if element.startswith('/'):
      # Closing tag: pop (and emit) up to the matching opener;
      # unmatched closers are silently dropped.
      element = element[1:]
      if element in banned:
        element = 'pre'
        token = '</pre>'
      if element in stack:
        top = None
        while stack and top != element:
          top = stack.pop()
          out.append('</%s>' % top)
        continue
      else:
        continue

    if element in block and stack and stack[-1] not in block:
      # close previous block if any
      # Fix: was xrange(), which does not exist on Python 3; range()
      # behaves identically here (and the stack is small).
      for i in range(len(stack) - 1, -1, -1):
        if stack[i] in block: break
      stack, previous_block = stack[:i], stack[i:]
      previous_block.reverse()
      for tag in previous_block:
        out.append('</%s>' % tag)

    if element in closing and not token.endswith('/>'):
      stack.append(element)
    out.append(token)
  # flush the stack
  out.extend(['</%s>' % element for element in reversed(stack)])
  return ''.join(out)
コード例 #11
0
def parse_html(content):
    """Inject the hashi-iframe bootstrap script into <head> and re-serialize.

    html5lib drops the doctype, so the original content is re-parsed as a DOM
    tree to recover it; if found, it is prepended to the serialized output.
    Returns *content* unchanged when it cannot be parsed.
    """
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")

        # Use the makeelement method of the head tag here to ensure that we use the same
        # Element class for both. Depending on the system and python version we are on,
        # we may be using the C implementation or the pure python and a mismatch will cause an error.
        script_tag = head.makeelement("script", {"type": "text/javascript"})
        script_tag.text = INITIALIZE_HASHI_FROM_IFRAME

        head.insert(0, script_tag)
        # Currently, html5lib strips the doctype, but it's important for correct rendering, so check the original
        # content for the doctype and, if found, prepend it to the content serialized by html5lib
        doctype = None
        try:
            # Now parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom,
                                             namespaceHTMLElements=False)
            tree = parser_dom.parse(content)
            # By HTML Spec if doctype is included, it must be the first thing
            # in the document, so it has to be the first child node of the document
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # render to a string by calling the toxml method
                # toxml uses single quotes by default, replace with ""
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            # Fix: Logger.warn is a deprecated alias of warning().
            logger.warning(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
コード例 #12
0
def balance(html, limit_words=None, ellipsis=' ...'):
    """Truncate *html* to at most *limit_words* words while keeping tags
    balanced; with no limit, just round-trip the markup through html5lib.

    Tokens alternate between text and tags (tag_re.split).  Open tags are
    tracked on a stack; any still open at the end are closed.
    """
    if not limit_words:
        return html5lib.serialize(html5lib.parse(html))
    word_count = 0
    tokens = tag_re.split(html)
    out = []
    stack = []  # currently-open (non-void) elements, innermost last
    for token in tokens:
        if not token.startswith('<'):
            # Text token: count words; crop once past the limit.
            if limit_words and word_count > limit_words:
                break
            words = token.split()
            word_count += len(words)
            if limit_words and word_count > limit_words:
                # crop is negative: slice off the words beyond the limit.
                crop = limit_words - word_count
                out.append(' '.join(words[:crop]) + ellipsis)
            else:
                out.append(token)
            continue
        if token.startswith('<!'): continue  # comments / doctype dropped
        if token == ']]>': continue
        if not token.endswith('>'): continue  # invalid
        element = token[1:-1].split()[0].lower()
        if not element: continue  # invalid
        if element in banned:
            # Banned elements are neutralized by rewriting them to <pre>.
            element = 'pre'
            token = '<pre>'

        if element.startswith('/'):
            # Closing tag: pop (and emit) up to the matching opener;
            # unmatched closers are silently dropped.
            element = element[1:]
            if element in banned:
                element = 'pre'
                token = '</pre>'
            if element in stack:
                top = None
                while stack and top != element:
                    top = stack.pop()
                    out.append('</%s>' % top)
                continue
            else:
                continue

        if element in block and stack and stack[-1] not in block:
            # close previous block if any
            for i in range(len(stack) - 1, -1, -1):
                if stack[i] in block: break
            stack, previous_block = stack[:i], stack[i:]
            previous_block.reverse()
            for tag in previous_block:
                out.append('</%s>' % tag)

        if element in closing and not token.endswith('/>'):
            stack.append(element)
        out.append(token)
    # flush the stack
    out.extend(['</%s>' % element for element in reversed(stack)])
    return ''.join(out)
コード例 #13
0
def test_sanitizer(expected, input):
    """Check sanitize_html(input) against the canonicalized expected markup."""
    # Round-trip the expected fragment through the serializer so both
    # sides of the comparison are in the same canonical form.
    canonical = serialize(parseFragment(expected),
                          omit_optional_tags=False,
                          use_trailing_solidus=True,
                          space_before_trailing_solidus=False,
                          quote_attr_values="always",
                          quote_char='"',
                          alphabetical_attributes=True)
    assert canonical == sanitize_html(input)
コード例 #14
0
ファイル: test_sanitizer.py プロジェクト: Coder206/servo
def runSanitizerTest(_, expected, input):
    """Assert that sanitize_html(input) equals the canonicalized *expected*."""
    serializer_opts = dict(omit_optional_tags=False,
                           use_trailing_solidus=True,
                           space_before_trailing_solidus=False,
                           quote_attr_values="always",
                           quote_char='"',
                           alphabetical_attributes=True)
    # Canonicalize the expected fragment with the same serializer settings.
    expected = serialize(parseFragment(expected), **serializer_opts)
    assert expected == sanitize_html(input)
コード例 #15
0
ファイル: sanitizer.py プロジェクト: sebix/python-textile
def sanitize(string):
    """
    Ensure that the text does not contain any malicious HTML code which might
    break the page.
    """
    from html5lib import parseFragment, serialize

    # Parse as a fragment and re-serialize with the sanitizer enabled.
    return serialize(parseFragment(string), sanitize=True,
                     omit_optional_tags=False, quote_attr_values='always')
コード例 #16
0
ファイル: test_sanitizer.py プロジェクト: Coder206/servo
def sanitize_html(stream):
    """Parse *stream* as an HTML fragment and re-serialize it with the
    html5lib sanitizer enabled, in canonical (alphabetical, fully-quoted,
    XHTML-style) form."""
    return serialize(parseFragment(stream),
                     sanitize=True,
                     omit_optional_tags=False,
                     use_trailing_solidus=True,
                     space_before_trailing_solidus=False,
                     quote_attr_values="always",
                     quote_char='"',
                     alphabetical_attributes=True)
コード例 #17
0
def sanitize_html(stream):
    """Sanitize an HTML fragment via html5lib parse + serialize."""
    fragment = parseFragment(stream)
    # Canonical output: sanitized, all tags kept, attributes quoted and
    # alphabetized, void elements rendered with a trailing solidus.
    options = dict(sanitize=True,
                   omit_optional_tags=False,
                   use_trailing_solidus=True,
                   space_before_trailing_solidus=False,
                   quote_attr_values="always",
                   quote_char='"',
                   alphabetical_attributes=True)
    return serialize(fragment, **options)
コード例 #18
0
    def _html_serialize(self, chunks, attributes, max_length):
        """Returns concatenated HTML code with SPAN tag.

    Args:
      chunks: The list of chunks to be processed. (ChunkList)
      attributes: If a dictionary, it should be a map of name-value pairs for
          attributes of output SPAN tags. If a string, it should be a class name
          of output SPAN tags. If an array, it should be a list of class names
          of output SPAN tags. (str or dict or list of str)
      max_length: Maximum length of span enclosed chunk. (int, optional)

    Returns:
      The organized HTML code. (str)
    """
        doc = ET.Element('span')
        for chunk in chunks:
            # Fix: Element.getchildren() was removed in Python 3.9; use the
            # supported len(doc) / doc[-1] forms instead.
            if chunk.is_space():
                if len(doc):
                    last = doc[-1]
                    if last.tail is None:
                        last.tail = ' '
                    else:
                        last.tail += ' '
                else:
                    if doc.text is not None:
                        # We want to preserve space in cases like "Hello 你好"
                        # But the space in " 你好" can be discarded.
                        doc.text += ' '
            else:
                if chunk.has_cjk() and not (max_length
                                            and len(chunk.word) > max_length):
                    ele = ET.Element('span')
                    ele.text = chunk.word
                    for k, v in attributes.items():
                        ele.attrib[k] = v
                    doc.append(ele)
                else:
                    # add word without span tag for non-CJK text (e.g. English)
                    # by appending it after the last element
                    if len(doc):
                        last = doc[-1]
                        if last.tail is None:
                            last.tail = chunk.word
                        else:
                            last.tail += chunk.word
                    else:
                        if doc.text is None:
                            doc.text = chunk.word
                        else:
                            doc.text += chunk.word
        result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        result = html5lib.serialize(html5lib.parseFragment(result),
                                    sanitize=True,
                                    quote_attr_values="always")
        return result
コード例 #19
0
def sanitize(string):
    """
    Ensure that the text does not contain any malicious HTML code which might
    break the page.
    """
    from html5lib import parseFragment, serialize

    fragment = parseFragment(string)
    # Re-serialize with html5lib's sanitizer filter enabled.
    cleaned = serialize(fragment,
                        sanitize=True,
                        omit_optional_tags=False,
                        quote_attr_values='always')
    return cleaned
コード例 #20
0
ファイル: zip.py プロジェクト: socketbox/studio
def parse_html(content):
    """Inject the screenshot scripts into <head> and re-serialize *content*.

    html5lib strips the doctype, so the content is re-parsed as a DOM tree
    to recover it; if found, it is prepended to the serialized output.
    Returns *content* unchanged when it cannot be parsed.
    """
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")
        for file in get_files("htmlScreenshot", "js"):
            SubElement(head, "script", attrib={"src": file['url']})
        # Currently, html5lib strips the doctype, but it's important for correct rendering, so check the original
        # content for the doctype and, if found, prepend it to the content serialized by html5lib
        doctype = None
        try:
            # Now parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom,
                                             namespaceHTMLElements=False)
            tree = parser_dom.parse(content)
            # By HTML Spec if doctype is included, it must be the first thing
            # in the document, so it has to be the first child node of the document
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # render to a string by calling the toxml method
                # toxml uses single quotes by default, replace with ""
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            # Fix: logging.warn is a deprecated alias of warning().
            logging.warning(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
コード例 #21
0
ファイル: engine.py プロジェクト: jowolf/tlg
 def save_template_once (self, dselector, fname):
   "Save partial template for later editing; don't overwrite!"
   from html5lib import serialize
   ext = fname.lower().split ('.') [-1]
   #s = dselector.outerHtml()
   s = serialize (dselector, tree="lxml")
   if ext == 'minaml':
     s = convert_to_shpaml (s)
   elif ext == 'html':
     pass # s = s
   else:
     print 'NYI'  # TODO: compiled html template, yaml for obdject / declaration
   if trace: print 'SAVING ONCE:', fname
   self.write (s, fname, overwrite=False)
コード例 #22
0
 def save_template_once(self, dselector, fname):
     "Save partial template for later editing; don't overwrite!"
     from html5lib import serialize
     ext = fname.lower().split('.')[-1]
     #s = dselector.outerHtml()
     s = serialize(dselector, tree="lxml")
     if ext == 'minaml':
         s = convert_to_shpaml(s)
     elif ext == 'html':
         pass  # s = s
     else:
         print 'NYI'  # TODO: compiled html template, yaml for obdject / declaration
     if trace: print 'SAVING ONCE:', fname
     self.write(s, fname, overwrite=False)
コード例 #23
0
def markdown(value):
    """Render *value* (CommonMark) to sanitized HTML safe for embedding.

    CommonMark runs in safe mode (raw HTML blocked, link blacklist); a
    second pass then whitelists tags and restricts href/src to http(s).
    """
    import CommonMark
    ast = CommonMark.Parser().parse(force_unicode(value))
    html = CommonMark.HtmlRenderer({'safe': True}).render(ast)

    import html5lib, urlparse

    def filter_url(url):
        # Allow only parseable http(s) URLs; javascript:, data:, etc. are
        # rejected by returning None.
        try:
            urlp = urlparse.urlparse(url)
        except Exception as e:
            # invalid URL
            return None
        if urlp.scheme not in ("http", "https"):
            return None
        return url

    valid_tags = set(
        'strong em a code p h1 h2 h3 h4 h5 h6 pre br hr img ul ol li span blockquote'
        .split())
    valid_tags = set('{http://www.w3.org/1999/xhtml}' + tag
                     for tag in valid_tags)
    dom = html5lib.HTMLParser().parseFragment(html)
    for node in dom.iter():
        if node.tag not in valid_tags and node.tag != 'DOCUMENT_FRAGMENT':
            node.tag = '{http://www.w3.org/1999/xhtml}span'
        # Fix: iterate over a snapshot — the loop pops attributes, and
        # mutating attrib while iterating its live items() view raises a
        # RuntimeError on Python 3 (on Python 2 items() was already a list).
        for name, val in list(node.attrib.items()):
            if name.lower() in ("href", "src"):
                val = filter_url(val)
                if val is None:
                    node.attrib.pop(name)
                else:
                    node.set(name, val)
            else:
                # No other attributes are permitted.
                node.attrib.pop(name)
    html = html5lib.serialize(dom,
                              quote_attr_values="always",
                              omit_optional_tags=False,
                              alphabetical_attributes=True)

    return safestring.mark_safe(html)
コード例 #24
0
ファイル: sanitizer.py プロジェクト: twm/html5iter
    def runtest(self):
        """Run one sanitizer fixture: the sanitized, canonically serialized
        input must equal the expected output."""
        case_input = self.test["input"]
        expected = self.test["output"]

        serialized = serialize(parseFragment(case_input),
                               sanitize=True,
                               omit_optional_tags=False,
                               use_trailing_solidus=True,
                               space_before_trailing_solidus=False,
                               quote_attr_values="always",
                               quote_char="'",
                               alphabetical_attributes=True)
        # Build a readable diff message for failures.
        errorMsg = "\n".join(["\n\nInput:", case_input,
                              "\nExpected:", expected,
                              "\nReceived:", serialized])
        assert expected == serialized, errorMsg
コード例 #25
0
ファイル: sanitizer.py プロジェクト: Coder206/servo
    def runtest(self):
        """Sanitize the fixture input and compare against the expected output."""
        fixture_input = self.test["input"]
        expected = self.test["output"]

        serializer_opts = dict(sanitize=True,
                               omit_optional_tags=False,
                               use_trailing_solidus=True,
                               space_before_trailing_solidus=False,
                               quote_attr_values="always",
                               quote_char="'",
                               alphabetical_attributes=True)
        serialized = serialize(parseFragment(fixture_input), **serializer_opts)
        errorMsg = "\n".join(["\n\nInput:", fixture_input,
                              "\nExpected:", expected,
                              "\nReceived:", serialized])
        assert expected == serialized, errorMsg
コード例 #26
0
def build_page(filepath: str) -> None:
    """Sync one HTML source file on disk into its Page database record.

    The page is looked up by the file's base name; ordering, visibility,
    icon, title, parent and body content come from the document itself.
    """
    with open(filepath, "rb") as f:
        document: et.Element = html5lib.parse(f)

    basename = os.path.split(filepath)[1]
    name, _ = os.path.splitext(basename)
    page = Page.objects.get(name=name)

    order_attr = select('meta[name=sfs_order]', document).get_attr('content')
    page.order = int(order_attr or 0)
    active_attr = select('meta[name=sfs_active]', document).get_attr('content')
    page.active = parse_bool_attr_value(active_attr)
    page.icon = select('meta[name=sfs_icon]', document).get_attr('content') or ''
    page.title = select('title', document).text

    parent_name = select('meta[name=sfs_parent]', document).get_attr('content')
    if parent_name:
        page.parent = Page.objects.get(name=parent_name)

    # The stored content is the serialized <body> only.
    page.content = html5lib.serialize(select('body', document).get(0))

    page.save()
コード例 #27
0
 def parse(self, response):
     """Extract up to 20 product items from the listing page, normalizing
     the markup through html5lib first."""
     normalized_html = html5lib.serialize(html5lib.parse(response.body))
     selector = Selector(text=normalized_html)
     for prod in selector.xpath(self.selectors['product'])[:20]:
         item = RepkaItem()
         item['name'] = prod.xpath(self.selectors['product-name']).extract()[0]
         item['url'] = prod.xpath(self.selectors['product-url']).extract()[0]
         item['price'] = prod.xpath(self.selectors['product-price']).extract()[0]
         item['image'] = prod.xpath(self.selectors['product-image']).extract()[0]
         item['description_titles'] = prod.xpath(
             self.selectors["product-description-title"]).extract()
         descr_info = prod.xpath(
             self.selectors["product-description-info"]).extract()
         # Drop empty description strings.
         item['description_info'] = list(filter(self.is_str_empty, descr_info))
         yield item
コード例 #28
0
    def html_serialize(self, attributes, max_length=None):
        """Returns concatenated HTML code with SPAN tag.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (:obj:`int`, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
        doc = ET.Element('span')
        for chunk in self:
            if (chunk.has_cjk()
                    and not (max_length and len(chunk.word) > max_length)):
                ele = ET.Element('span')
                ele.text = chunk.word
                for key, val in attributes.items():
                    ele.attrib[key] = val
                doc.append(ele)
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element.
                # Fix: Element.getchildren() was removed in Python 3.9; use
                # the supported len(doc) / doc[-1] forms instead.
                if len(doc):
                    last = doc[-1]
                    if last.tail is None:
                        last.tail = chunk.word
                    else:
                        last.tail += chunk.word
                else:
                    if doc.text is None:
                        doc.text = chunk.word
                    else:
                        doc.text += chunk.word
        result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        result = html5lib.serialize(html5lib.parseFragment(result),
                                    sanitize=True,
                                    quote_attr_values='always')
        return result
コード例 #29
0
 def _inject_session(self, session, html):
     """Rewrite *html* so the serialized session travels with every request:
     URL attributes listed in self.session_url_rewrite_map get the session
     appended, and every <form> gets a hidden input carrying it.  Returns
     the serialized, rewritten document."""
     serialized = session.serialize()
     def _walk(node):
         # Depth-first: rewrite children before the node itself.
         for child in node.childNodes:
             _walk(child)
         if node.name in self.session_url_rewrite_map:
             # Rewrite each attribute configured for this tag name.
             for attr in self.session_url_rewrite_map[node.name]:
                 value = node.attributes.get(attr)
                 if value is None:
                     continue
                 new_value = self._rewrite_session_url(value, serialized)
                 node.attributes[attr] = new_value
         elif node.name == 'form':
             # Forms carry the session as a hidden field instead of a URL.
             hidden = Element('input')
             hidden.attributes.update(
                 type='hidden',
                 name=self.session_url_key,
                 value=serialized
             )
             node.childNodes.append(hidden)
     tree = _parser.parse(html)
     _walk(tree)
     return serialize(tree)
コード例 #30
0
    def render_content(self, content_text: str) -> str:
        """Resolve data-sfs-src / data-sfs-href markers in *content_text*
        and return the serialized <body>."""
        document: et.Element = html5lib.parse(content_text)
        body = select('body', document).get(0)

        # Rewrite src attributes flagged with data-sfs-src.
        for element in select('*[data-sfs-src]', body):
            mode = element.get_attr('data-sfs-src')
            src = element.get_attr('src')

            if mode == 'static':
                element.set_attr('src', static(src))
            elif mode == 'file':
                element.set_attr('src', file(src))

            element.del_attr('data-sfs-src')

        # Rewrite anchors flagged with data-sfs-href into page URLs.
        for a in select('a[data-sfs-href]', body):
            name = a.get_attr('href')
            a.set_attr('href', reverse('page', kwargs={'name': name}))
            a.del_attr('data-sfs-href')

        return html5lib.serialize(body)
コード例 #31
0
def markdown(value):
    """Render untrusted Markdown *value* to sanitized HTML.

    Two sanitization passes: CommonMark/GFM safe mode (blocks raw HTML and
    some links via a blacklist), then a whitelist pass over tags and URL
    schemes. Finally, if the output contains an <h1>, every heading is
    demoted one level so the result cannot clash with the page's own <h1>.

    Returns a Django-safe string (``safestring.mark_safe``).
    """
    import cmarkgfm
    from cmarkgfm.cmark import Options as cmarkgfmOptions

    # First pass: CMARK_OPT_SAFE suppresses raw HTML and dangerous URLs.
    html = cmarkgfm.github_flavored_markdown_to_html(
        value, options=cmarkgfmOptions.CMARK_OPT_SAFE)

    import html5lib, urllib.parse

    def filter_url(url):
        # Allow only http(s); anything unparsable or using another scheme
        # (javascript:, data:, ...) is dropped.
        try:
            urlp = urllib.parse.urlparse(url)
        except Exception:
            # invalid URL
            return None
        if urlp.scheme not in ("http", "https"):
            return None
        return url

    valid_tags = set(
        'strong em a code p h1 h2 h3 h4 h5 h6 pre br hr img ul ol li span blockquote'
        .split())
    valid_tags = set('{http://www.w3.org/1999/xhtml}' + tag
                     for tag in valid_tags)
    dom = html5lib.HTMLParser().parseFragment(html)
    for node in dom.iter():
        # Comments/PIs have a non-string tag (a callable in etree); skip
        # them rather than treating the callable as an element name.
        if not isinstance(node.tag, str):
            continue
        if node.tag not in valid_tags and node.tag != 'DOCUMENT_FRAGMENT':
            # Unknown elements are neutralized to <span> rather than
            # dropped, preserving their (attribute-filtered) content.
            node.tag = '{http://www.w3.org/1999/xhtml}span'
        for name, val in list(node.attrib.items()):
            if name.lower() in ("href", "src"):
                val = filter_url(val)
                if val is None:
                    node.attrib.pop(name)
                else:
                    node.set(name, val)
            else:
                # No other attributes are permitted.
                node.attrib.pop(name)

    # If there is an h1 in the output, demote all of the headings
    # so we don't create something that interferes with the page h1.
    has_h1 = any(
        isinstance(node.tag, str)
        and node.tag in ("h1", "{http://www.w3.org/1999/xhtml}h1")
        for node in dom.iter())
    if has_h1:
        for node in dom.iter():
            if not isinstance(node.tag, str):
                continue
            # Raw string: "\{" is an invalid escape in a plain literal
            # (SyntaxWarning on modern Python). Cap the demotion at h6,
            # since HTML defines no deeper heading level.
            m = re.match(r"(\{http://www.w3.org/1999/xhtml\})?h(\d)$", node.tag)
            if m:
                level = min(int(m.group(2)) + 1, 6)
                node.tag = (m.group(1) or "") + "h" + str(level)

    html = html5lib.serialize(dom,
                              quote_attr_values="always",
                              omit_optional_tags=False,
                              alphabetical_attributes=True)

    return safestring.mark_safe(html)
コード例 #32
0
ファイル: engine.py プロジェクト: jowolf/tlg
 def _save (self, content, fname, overwrite=True):
   """Serialize the lxml tree *content* to HTML5 text, cache it on
   ``self.content``, and write it to *fname* (honoring *overwrite*)."""
   from html5lib import serialize
   rendered = serialize (content, tree="lxml")
   self.content = rendered
   self.write (rendered, fname, overwrite)
コード例 #33
0
ファイル: post.py プロジェクト: lttxzmj/envision
def render_sanitized_html(html):
    """Parse untrusted *html* and return a sanitized serialization.

    Uses the serializer's ``sanitize`` filter — the supported html5lib
    API — instead of ``HTMLParser(tokenizer=HTMLSanitizer)``, whose
    tokenizer-based sanitizer was deprecated and removed in html5lib 1.0.
    """
    parser = HTMLParser()
    etree = parser.parse(html)
    # sanitize=True applies html5lib.filters.sanitizer.Filter during
    # serialization, whitelisting tags/attributes/schemes.
    return serialize(etree, sanitize=True)
コード例 #34
0
 def _save(self, content, fname, overwrite=True):
     """Render the lxml-backed document to an HTML string, remember it
     on ``self.content``, and persist it to *fname*."""
     from html5lib import serialize
     markup = serialize(content, tree="lxml")
     self.content = markup
     self.write(markup, fname, overwrite)