Esempio n. 1
0
 def parse(self, content):
     """Parse *content* with html5lib, recording success/failure per URL.

     On a parse failure the current URL is added to self.buggyURLs and
     update processing is skipped; the URL is always marked visited.
     """
     failed = False
     tree = None  # defensive: only read below when parsing succeeded
     p = html5lib.HTMLParser(tree=etree.TreeBuilder)
     try:
         tree = p.parse(content)
     except Exception:
         # Narrowed from a bare except: so KeyboardInterrupt/SystemExit
         # still propagate instead of marking the URL as buggy.
         self.buggyURLs.add(self.currentURL)
         failed = True
         print("BUGGY:", self.currentURL)
     self.visitedURLs.add(self.currentURL)
     if not failed:
         self.updateURLs(tree)
Esempio n. 2
0
 def test_sanitizer(self):
     """data-* attributes survive cleaning; the global whitelist is untouched."""
     original_whitelist = html5lib.sanitizer.HTMLSanitizer.allowed_attributes[:]
     sanitizer.TextSanitizer.allow_token_parsers = (
         attribute_parsers.DataAttributeParser, )
     dom_parser = html5lib.HTMLParser(
         tree=treebuilders.getTreeBuilder("dom"),
         tokenizer=sanitizer.TextSanitizer)
     cleaned = html.clean_html(
         '<span data-one="1" data-two="2">some text</span>',
         full=False, parser=dom_parser)
     self.assertTrue('data-one="1"' in cleaned)
     self.assertTrue('data-two="2"' in cleaned)
     # Cleaning must not have mutated the class-level attribute whitelist.
     self.assertEqual(original_whitelist,
                      html5lib.sanitizer.HTMLSanitizer.allowed_attributes)
Esempio n. 3
0
    def assertValidHTML(self, content, msg=None):
        """Fail with a line-annotated message if *content* is not valid HTML.

        Args:
            content: HTML fragment string to validate.
            msg: optional message merged via unittest's _formatMessage.
        Raises:
            self.failureException when html5lib reports parse errors.
        """
        parser = html5lib.HTMLParser()
        # Bug fix: validate the *content* argument, not self.panel.content —
        # otherwise the error positions below index into the wrong text.
        parser.parseFragment(content)
        if parser.errors:
            default_msg = ['Content is invalid HTML:']
            lines = content.split('\n')
            for position, errorcode, datavars in parser.errors:
                default_msg.append('  %s' % html5lib.constants.E[errorcode] % datavars)
                default_msg.append('    %s' % lines[position[0] - 1])

            msg = self._formatMessage(msg, '\n'.join(default_msg))
            raise self.failureException(msg)
def get_document_for(srcname):
    """Parse *srcname* into an xml.dom.minidom document.

    Files recognized by is_html() are parsed leniently with html5lib;
    anything else is treated as XML.
    """
    if is_html(srcname):
        # An HTML file. The context manager closes the file even if
        # parsing raises (the original leaked the handle on error).
        with open(srcname, "rb") as f:
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))
            document = parser.parse(f)
    else:
        # An XML file
        document = xml.dom.minidom.parse(srcname)
    return document
Esempio n. 5
0
  def get_toc(self, path):
    """Build (and memcache for an hour) a table of contents for a page.

    Only tutorial/mobile/style-guide pages get a TOC. Each <h2> with an
    id attribute becomes an entry: {'level', 'id', 'text'}.
    Returns '' for non-tutorial paths, else the list of entries.
    """

    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or re.search('style-guide', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None  # the TOC entry currently being accumulated, if any
      innerTagCount = 0  # depth of tags nested inside the current heading
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2']:
            # Only headings carrying an id attribute become TOC entries.
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1],
                  'text': ''
                }
          elif current is not None:
            # A tag opened inside the heading: track depth so its EndTag
            # doesn't close the entry prematurely.
            innerTagCount += 1
        elif element['type'] == 'Characters' and current is not None:

          # if we already have text check:
          # - whether the last character is a < or a (
          # - the string being added starts with > or )
          # in which case do not add a space
          if current['text'] != '':

            if current['text'][-1] != '<' and not re.match(r"^[\>\)]", element['data']):
              current['text'] += ' '

          current['text'] = current['text'] + element['data']

        elif element['type'] == 'EndTag' and current is not None:
          if innerTagCount > 0:
            innerTagCount -= 1
          else:
            # The heading itself closed: finalize and store the entry.
            current['text'] = cgi.escape(current['text'])
            toc.append(current)
            current = None

      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
Esempio n. 6
0
    def parse(self, live=False):
        """Parse historical urls for a web page over years.

        We first determine the first year that has valid snapshots, then
        query the wayback index once per year up to the current year.

        Args:
            live: when True, store live-converted urls via convert_live_url.
        Returns:
            self.results (the accumulated items), or None when the wayback
            index page could not be fetched.
        """
        self._parse_called = True

        wayback_page_whole = self.open_url(self.get_wayback_page(self.url))
        # Identity comparison with None, per PEP 8.
        if wayback_page_whole is None:
            return None

        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        html_doc = parser.parse(wayback_page_whole)

        position_div = html_doc.find("./{*}body/{*}div[@id='position']")
        sketchinfo = position_div.find(
            "./{*}div[@id='wbSearch']/{*}div[@id='form']/{*}div[@id='wbMeta']/{*}p[@class='wbThis']"
        )
        first_url = sketchinfo.getchildren()[-1].attrib['href']
        first_year = self.extract_year(first_url)

        for year in range(first_year, datetime.datetime.now().year + 1):
            # Be polite to the host server
            time.sleep(random.randint(1, 3))

            # Note: the timestamp in search url indicates the time scale of query:
            # E.g., wildcard * matches all of the items in specific year.
            # If only * is supported, the results of latest year are returned.
            # I found that it returned wrong results if the month and day numbers are small like 0101,
            # so a bigger number is used to match wildly.
            wayback_page_year = "%s/%d0601000000*/%s" % (self.prefix, year,
                                                         self.url)
            page_year, his_urls = self._parse_wayback_page(wayback_page_year)

            # To exclude duplicated items that don't match the year
            # By default the results of latest year are returned
            # if some year hasn't been crawled
            if page_year is None or page_year != year:
                continue
            module_logger.debug("%s: %d pages found for year %d" %
                                (self.url, len(his_urls), page_year))

            for url in his_urls:
                try:
                    page_year = self.extract_year(url)
                except Exception:
                    # Narrowed from a bare except: keep Ctrl-C working.
                    module_logger.error(
                        "Invalid timestamp of wayback url: %s" % url)
                    continue
                if year == page_year:
                    if live:
                        self.add_item(year, self.convert_live_url(url))
                    else:
                        self.add_item(year, url)

        return self.results
Esempio n. 7
0
def get_items(location, encoding=None):
    """
    Pass in a string or file-like object and get a list of Items present in the
    HTML document.

    *encoding*, when given, is forwarded to html5lib as the transport
    encoding.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parse_kwargs = {'transport_encoding': encoding} if encoding else {}
    document = parser.parse(location, **parse_kwargs)
    return _find_items(document)
Esempio n. 8
0
def get_document_for(fn, spec):
    """Parse *fn* into an xml.dom.minidom document.

    .htm/.html files are parsed leniently with html5lib; anything else is
    treated as XML. (*spec* is accepted for interface compatibility.)
    """
    if fn.endswith((".htm", ".html")):
        # An HTML file. Open in binary mode so html5lib can sniff the
        # encoding itself; the context manager also fixes the handle
        # leaking when parse() raised.
        with open(fn, "rb") as f:
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))
            document = parser.parse(f)
    else:
        # An XML file
        document = xml.dom.minidom.parse(fn)
    return document
Esempio n. 9
0
def _html5lib_parser():
    """
    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries
    """
    # namespaceHTMLElements=False strips the XHTML namespace from tag
    # names (e.g. "{http://www.w3.org/1999/xhtml}div" -> "div"), which
    # the downstream algorithm relies on.
    lxml_tree = html5lib.treebuilders.getTreeBuilder("lxml")
    return html5lib.HTMLParser(lxml_tree, namespaceHTMLElements=False)
Esempio n. 10
0
    def parser(self):
        """Parse every saved page under pages/ into stemmed word lists.

        Uses html5lib with the (deprecated) BeautifulSoup tree builder.
        Content is taken only from <p> tags, words are lower-cased,
        stop words removed, then Porter-stemmed.

        Returns:
            dict mapping (title, doc_id, link) -> list of stemmed words,
            where doc_id is the numeric page filename.
        Side effects:
            Deletes any page file that fails to parse.
        """

        #get stopwords; remove newline char
        parsed_html = {}
        stopwords = [word[:-1] for word in open('stopwords.txt')]
        pstemmer = stemmer.PorterStemmer()

        htmldocs = os.listdir('pages/')  #grap all html docs and parse them
        words_splitter = re.compile(r'\W*')  #split on non words
        for htmldoc in htmldocs:
            f = open('pages/' + htmldoc, 'r')
            link = f.readline()  # first line of each saved page is its URL
            html = f.readlines()

            try:
                print htmldoc
                p = html5lib.HTMLParser(
                    tree=treebuilders.getTreeBuilder('beautifulsoup'))
                tree = p.parse(html)
            except:
                # Unparseable page: drop it from the corpus and move on.
                os.remove(os.path.join('pages', htmldoc))
                print 'error parsing %s' % htmldoc
                continue

            title = tree.findAll('title')
            if title: title = title[0].text
            else: title = ''

            #grab text from p tags
            data = [p.text.lower() for p in tree.findAll('p')]
            #remove stopwords
            unstemmed_words = [
                word for word in words_splitter.split(''.join(data))
                if word != '' and word not in stopwords
            ]
            stemmed_words = [
                pstemmer.stem(word, 0,
                              len(word) - 1) for word in unstemmed_words
            ]
            parsed_html[(title, int(htmldoc), link)] = stemmed_words

        return parsed_html
def test(request):
    """Echo back POSTed text if it parses as strict HTML5, else "Error".

    GET requests render the index page.
    """
    if request.method == "POST":
        text = request.POST['text']
        # strict=True makes html5lib raise on the first parse error.
        html5parser = html5lib.HTMLParser(strict=True)
        try:
            ans_text = html5parser.parse(text)
        except Exception:
            # Narrowed from a bare except: so system-exiting exceptions
            # are not silently converted into an "Error" response.
            ans_text = False
        if ans_text:
            return HttpResponse(text)
        else:
            return HttpResponse("Error")
    return render(request, 'index.html')
Esempio n. 12
0
def _selectors(html):
    """Return the set of "tag" and "tag[attr]" selectors present in *html*."""
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse(StringIO.StringIO(html), encoding='utf8')
    found = set()
    pending = _child_elements(doc)
    while pending:
        element = pending.pop()
        found.add(element.nodeName)
        for attr_name in element.attributes.keys():
            found.add("%s[%s]" % (element.nodeName, attr_name))
        pending.extend(_child_elements(element))
    return found
Esempio n. 13
0
def sanitize_html(html):
    """Sanitizes an HTML fragment, stripping forbidden markup, and
    returns the re-serialized HTML string.
    """
    builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                   tree=builder).parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return ''.join(html_serializer.serialize(token_stream))
Esempio n. 14
0
    def parse_html_string(self, html_str):
        """Parse the given HTML string to a XML DOM tree.

        Args:
          html_str: string. The HTML document to be parsed.

        Returns:
          An ElementTree representation of the DOM.
        """
        tree_builder = html5lib.treebuilders.getTreeBuilder('etree', cElementTree)
        html_parser = html5lib.HTMLParser(tree=tree_builder,
                                          namespaceHTMLElements=False)
        return html_parser.parse(html_str)
Esempio n. 15
0
 def test_html5_validation(self):
     """The /regular/HTML5/ response must parse as valid HTML5."""
     response = self.client.get("/regular/HTML5/")
     content = response.content
     validator = html5lib.HTMLParser()
     validator.parse(content)
     if not validator.errors:
         return
     # Build a line-annotated failure message from the parser's errors.
     report = ["Content is invalid HTML:"]
     source_lines = content.split(b"\n")
     for position, errorcode, datavars in validator.errors:
         report.append("  %s" % html5lib.constants.E[errorcode] % datavars)
         report.append("    %r" % source_lines[position[0] - 1])
     raise self.failureException(self._formatMessage(None, "\n".join(report)))
Esempio n. 16
0
    def feed(self, markup):
        """Parse *markup* with html5lib and record the detected encoding.

        parse_only is not honored by this tree builder; a warning is
        issued and the entire document is parsed.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Esempio n. 17
0
def getSoup(spec_url, hset, domain):
    """Fetch *spec_url* (prefixed with *domain* when absent from the url)
    and parse it into a BeautifulSoup tree via html5lib."""
    full_url = spec_url if domain in spec_url else domain + spec_url
    data = URLGRABBER.urlread(full_url)

    soup_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("beautifulsoup"))
    return soup_parser.parse(data)
Esempio n. 18
0
def get_components_using_html5lib(html):
    """Find lesson components using the pure python html5lib library."""

    tree_builder = html5lib.treebuilders.getTreeBuilder('etree', cElementTree)
    parser = html5lib.HTMLParser(tree=tree_builder,
                                 namespaceHTMLElements=False)
    # Wrap the fragment in a <div> so [0] yields a single root element.
    wrapper = parser.parseFragment('<div>%s</div>' % html)[0]
    found = []
    for node in wrapper.findall('.//*[@instanceid]'):
        entry = {'cpt_name': node.tag}
        entry.update(node.attrib)
        found.append(entry)
    return found
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    fragment = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    tree_walker = html5lib.getTreeWalker('etree')
    ser = HTMLSerializer(alphabetical_attributes=True,
                         quote_attr_values='always')
    rendered = ser.render(tree_walker(fragment))

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    assert rendered == '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
Esempio n. 20
0
def get_xpaths(assembly_id):
    """Fetch the Korean Wikipedia member-list page for the given assembly
    and return the nodes matched by settings["X_PATH"]."""
    url = ("http://ko.wikipedia.org/wiki/대한민국_제%s대_국회의원_목록"
           % assembly_id)
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", settings["USER_AGENT"])
    response = urllib2.urlopen(request)

    page = parser.parse(response)
    return page.xpath(settings["X_PATH"])
Esempio n. 21
0
def embed(rawhtml, outfile, rootdirs=(serve.cwd, )):
    """Inline external resources of *rawhtml* and write the result to *outfile*.

    Scripts, stylesheets and images referenced by the document are resolved
    against *rootdirs* and embedded in place, producing a self-contained
    HTML file written as UTF-8.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    dom = parser.parse(rawhtml)
    head = dom.getElementsByTagName("head")[0]
    # Prepend a shim aliasing window.URL to window.webkitURL.
    wurl = dom.createElement("script")
    wurl.setAttribute("type", "text/javascript")
    wurl.appendChild(
        dom.createTextNode('''
if (window.webkitURL)
    window.URL = window.webkitURL;
'''))
    head.insertBefore(wurl, head.childNodes[0])

    for script in dom.getElementsByTagName("script"):
        stype = script.getAttribute("type")
        if stype == "text/html":
            # HTML template inside <script type="text/html">: parse it,
            # embed its images, then re-serialize it back into the script.
            sdom = parser.parse(script.childNodes[0].wholeText)
            _embed_images(sdom, rootdirs)
            # Outer while loop re-checks because removeChild while
            # iterating childNodes skips siblings.
            while len(script.childNodes) > 0:
                for c in script.childNodes:
                    script.removeChild(c)

            shtml = ""
            for el in sdom.getElementsByTagName("body")[0].childNodes:
                shtml += el.toxml()
            script.appendChild(dom.createTextNode(shtml))
        elif stype == "text/javascript":
            src = script.getAttribute("src")
            if len(src) > 0:
                _embed_js(dom, script, rootdirs)

    # Replace each external stylesheet <link> with an inline <style>.
    for css in dom.getElementsByTagName("link"):
        if (css.getAttribute("type") == "text/css"):
            csstext = _embed_css(
                _resolve_path(css.getAttribute("href"), rootdirs), rootdirs)
            ncss = dom.createElement("style")
            ncss.setAttribute("type", "text/css")
            ncss.appendChild(dom.createTextNode(csstext))
            css.parentNode.insertBefore(ncss, css)
            css.parentNode.removeChild(css)

    _embed_images(dom, rootdirs)

    #Save out the new html file
    with open(outfile, "w") as htmlfile:
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
        walker = html5lib.treewalkers.getTreeWalker("dom")

        for line in serializer.serialize(walker(dom)):
            htmlfile.write(line.encode("utf-8"))
Esempio n. 22
0
def parse_html(content):
    """Inject screenshot <script> tags into *content* and re-serialize it.

    Returns the serialized HTML (with the original doctype re-attached
    when one was present), or *content* unchanged when parsing fails.
    """
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")
        for file in get_files("htmlScreenshot", "js"):
            SubElement(head, "script", attrib={"src": file['url']})
        # Currently, html5lib strips the doctype, but it's important for correct rendering, so check the original
        # content for the doctype and, if found, prepend it to the content serialized by html5lib
        doctype = None
        try:
            # Now parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom,
                                             namespaceHTMLElements=False)
            tree = parser_dom.parse(content)
            # By HTML Spec if doctype is included, it must be the first thing
            # in the document, so it has to be the first child node of the document
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # render to a string by calling the toxml method
                # toxml uses single quotes by default, replace with ""
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            # Best-effort: a failure here only loses the doctype, not the page.
            logging.warn(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
Esempio n. 23
0
def get_items(location, encoding='UTF-8'):
    """
    Pass in a file or file-like object and get a list of Items present in the
    HTML document.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))

    if sys.version_info.major == 3:
        tree = parser.parse(location)
    else:
        # html5lib's Python 2 era API accepted an encoding keyword.
        tree = parser.parse(location, encoding=encoding)

    return _find_items(tree)
Esempio n. 24
0
def validate_content(testcase, data, page_descr="unknown page"):
    """
    Validate data as HTML5.

    testcase should be a unittest.TestCase object (or similar).
    page_descr should be a human-readable description of the page being tested.
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parser.parse(data)
    if parser.errors:
        # Dump the offending page for manual inspection; the context
        # manager guarantees the file is closed even if write() raises.
        with open("tmp-validation.html", "wb") as fh:
            fh.write(data)
        testcase.fail("Invalid HTML5 produced in %s:\n  %s" % (page_descr, str(parser.errors)))
Esempio n. 25
0
def read_xml(filename, mangle_entities=False):
    """
    Read in a document, returning the parsed document node. Parse errors
    are reported on stderr but do not abort the read.
    """
    lenient_parser = html5lib.HTMLParser(
        strict=False, tree=treebuilders.getTreeBuilder('lxml'))
    doc = html5parser.parse(filename, parser=lenient_parser)

    if lenient_parser.errors:
        sys.stderr.write('errors in {0}\n'.format(filename))
        for e in lenient_parser.errors:
            sys.stderr.write('    {0}\n'.format(e))

    return doc
Esempio n. 26
0
def sanitize_html_string(content):
    """Sanitizes the given html string.

  Raises:
    forms.ValidationError in case of an error.
  """
    try:
        html_parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
        fragment = html_parser.parseFragment(content, encoding='utf-8')
        cleaned_content = ''.join(node.toxml() for node in fragment.childNodes)
    except (HTMLParser.HTMLParseError, html5parser.ParseError) as msg:
        raise forms.ValidationError(msg)

    return cleaned_content
Esempio n. 27
0
 def clean(self, value):
     """Sanitize the cleaned field value and return it re-serialized as HTML.

     Runs the parent clean(), parses the result through html5lib's
     sanitizing tokenizer, and serializes the fragment back to a string.
     """
     chars = super(HTMLField, self).clean(value)
     p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                             tree=treebuilders.getTreeBuilder("dom"))
     s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                  quote_attr_values=True)
     # Unicode input is handled fine without an explicit encoding.
     dom_tree = p.parseFragment(chars)

     walker = treewalkers.getTreeWalker("dom")
     stream = walker(dom_tree)
     # join() replaces the original quadratic += string-concatenation loop.
     return "".join(s.serialize(stream))
Esempio n. 28
0
def get_options(template):
    """Extract print options from data-gbclient-* attributes on the root element."""
    doc = html5lib.HTMLParser(
        tree=html5lib.getTreeBuilder("dom")).parse(template)
    root_attrs = doc.documentElement.attributes

    options = {}

    size_attr = root_attrs.get('data-gbclient-media-size')
    options['media'] = size_attr.value if size_attr else '62mm'

    orientation_attr = root_attrs.get('data-gbclient-orientation')
    if orientation_attr:
        options['orientation-requested'] = orientation_attr.value

    return options
Esempio n. 29
0
def prepare_template_stream(stream, base_url):
    """Prepares the stream to be stored in the DB"""
    text = force_unicode(stream) if stream else u''
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('lxml'),
                                 namespaceHTMLElements=False)
    document_tree = parser.parse(text)
    # Swap local script references for their butter-library equivalents.
    for script in document_tree.xpath('//script[@src]'):
        replacement = get_butter_library(script.get('src'))
        if replacement:
            script.set('src', replacement)
    make_links_absolute(document_tree, base_url)
    return _serialize_stream(document_tree)
Esempio n. 30
0
def test_th_has_no_css_rules():
    """No background-color CSS rules should apply to the first <th>."""
    html = HTML_CONTENT

    pdf_output = BytesIO()
    pdf = pisaDocument(BytesIO(html.encode('utf-8')), pdf_output)

    dom_parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    document = dom_parser.parse(html)
    th = CSSDOMElementInterface(document.getElementsByTagName("th")[0])
    rules = pdf.cssCascade.findCSSRulesFor(th, "background-color")

    tools.assert_list_equal(rules, [])