def test_prettify(self):
        # prettify() is expected to remove both <script> elements (the one in
        # <head> and the one after the body) and leave the rest of the markup,
        # including the whitespace that surrounded each script, untouched.
        #
        # NOTE(review): the second script element had been written as
        # "<scr + ipt>", which looks like an extraction artifact; the expected
        # string below only matches if it is a real <script> tag, so it is
        # restored here. The malformed ">/title>" and the unclosed second
        # "<body>" occur in both the input and the expectation and are left
        # as they were.
        html_with_script = """
<html>
    <head>
        <title>Hello World!>/title>
        <script>
            $(document).ready(function() {
                $("button").button();
            });
        </script>
    </head>
    <body>
        <div>a brown fox</div>
    <body>
    <script>
        $(document).ready(function() {
            $("button").button();
        });
    </script>
</html>
"""
        # todo find a smarter way to compare html

        # Same value as the former one-line literal, split per output line so
        # the whitespace-only lines left behind by the scripts are visible.
        html_with_no_script = (
            "\n<html>\n"
            "    <head>\n"
            "        <title>Hello World!>/title>\n"
            "        \n"
            "    </head>\n"
            "    <body>\n"
            "        <div>a brown fox</div>\n"
            "    <body>\n"
            "    \n"
            "</html>\n"
        )

        parser = KetchlipHTMLParser()
        self.assertEqual(html_with_no_script, parser.prettify(html_with_script))
# Example no. 2
# 0
    def indexing(self, result):
        """
        Index one crawled page and remember its outgoing links.

        result => {Crawler.CONTENT:html, Crawler.URL:http://.., Crawler.EXPANDED_URL:http://.., Crawler.LINKS:[]}

        The URL is stripped and used as the key into ``self.lookup_url``. A URL
        that is already present is logged and skipped. Otherwise the page's
        title, text and description (capped at 260 characters) are extracted
        with ``KetchlipHTMLParser``, a new lookup entry is stored, and the
        combined text is passed to ``self.add_page_to_index``. When
        ``Crawler.LINKS`` is present in ``result`` it is stored in
        ``self.graph`` under the URL.

        An ``HTMLParseError`` is logged and swallowed; nothing is returned.
        """
        DESCRIPTION_MAX_LENGTH = 260
        try:
            start = time.time()
            url = result[Crawler.URL].strip()

            if url in self.lookup_url:
                logger.info("Already indexed " + url)
                return

            # All parsing happens before any state is touched, so a parse
            # failure leaves lookup_url, the index and the graph unchanged.
            parser = KetchlipHTMLParser(result[Crawler.CONTENT])
            title = parser.title()
            text = parser.text()
            description = parser.description(DESCRIPTION_MAX_LENGTH)

            # Build the entry completely before publishing it, so a missing
            # EXPANDED_URL key cannot leave a half-filled entry behind.
            # The first slot is the number of URLs known so far.
            entry = [len(self.lookup_url), "", "", ""]
            entry[self.EXPANDED_URL_POS] = result[Crawler.EXPANDED_URL]
            entry[self.TITLE_POS] = title
            entry[self.DESCRIPTION_POS] = description
            self.lookup_url[url] = entry

            self.add_page_to_index(
                self.index,
                entry[self.URL_INDEX_POS],
                " ".join([title, description, text]),
            )

            if Crawler.LINKS in result:
                self.graph[url] = result[Crawler.LINKS]

            elapsed = time.time() - start
            logger.info("Indexed " + url + " in " + "%.2f" % round(elapsed, 2) + " seconds")
            # Zero-length sleep: give other greenlets a chance to run.
            gevent.sleep(0)
        except HTMLParseError:
            # The exception object was never used; dropping the
            # "HTMLParseError, e" binding keeps this valid on Python 2 and 3.
            logger.info("Failed to parse HTML")
 def test_parse_description_should_chomp_(self):
     # With a limit of 20, description() on the self.html fixture is expected
     # to come back shortened and suffixed: "This is the content ...".
     limit = 20
     shortened = KetchlipHTMLParser(self.html).description(limit)
     self.assertEqual("This is the content ...", shortened)
 def test_parse_description_should_return_empty_string_if_description_is_missing(self):
     # description() must yield "" for the empty head/body fixture.
     fixture = self.html_with_empty_head_and_body
     found = KetchlipHTMLParser(fixture).description()
     self.assertEqual("", found)
 def test_parse_description(self):
     # Without a length argument, description() on the self.html fixture is
     # expected to return the full text "This is the content meta tag".
     found = KetchlipHTMLParser(self.html).description()
     self.assertEqual("This is the content meta tag", found)
 def test_parse_text_without_empty__html_content_should_return_empty_string(self):
     # A parser built from an empty string must report "" as its text.
     extracted = KetchlipHTMLParser("").text()
     self.assertEqual("", extracted)
 def test_parse_text_without_body_content_should_return_empty_string(self):
     # text() must yield "" for the empty head/body fixture.
     fixture = self.html_with_empty_head_and_body
     extracted = KetchlipHTMLParser(fixture).text()
     self.assertEqual("", extracted)
 def test_parse_text(self):
     # text() on the self.html fixture is expected to return the sentences
     # below joined by single spaces.
     wanted = "inside a p. inside a td. inside another td. inside a div. inside a span."
     extracted = KetchlipHTMLParser(self.html).text()
     self.assertEqual(wanted, extracted)
 def test_parse_title_should_return_empty_string_if_content_is_empty(self):
     # A parser built from an empty string must report "" as its title.
     heading = KetchlipHTMLParser("").title()
     self.assertEqual("", heading)
 def test_parse_title_should_return_empty_string_if_title_tag_is_missing(self):
     # title() must yield "" for the empty head/body fixture.
     fixture = self.html_with_empty_head_and_body
     heading = KetchlipHTMLParser(fixture).title()
     self.assertEqual("", heading)
 def test_parse_title_should_return_empty_string_if_title_is_empty(self):
     # title() must yield "" for the self.html_no_title_text fixture.
     fixture = self.html_no_title_text
     heading = KetchlipHTMLParser(fixture).title()
     self.assertEqual("", heading)
 def test_parse_title(self):
     # title() on the self.html fixture is expected to be "This is the title".
     heading = KetchlipHTMLParser(self.html).title()
     self.assertEqual("This is the title", heading)